summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/nhc.c8
-rw-r--r--net/802/fc.c2
-rw-r--r--net/802/fddi.c11
-rw-r--r--net/802/hippi.c16
-rw-r--r--net/8021q/vlan.c11
-rw-r--r--net/8021q/vlan.h2
-rw-r--r--net/8021q/vlan_dev.c10
-rw-r--r--net/9p/client.c20
-rw-r--r--net/Kconfig22
-rw-r--r--net/Makefile3
-rw-r--r--net/appletalk/ddp.c4
-rw-r--r--net/atm/br2684.c6
-rw-r--r--net/atm/common.c4
-rw-r--r--net/atm/lec.c19
-rw-r--r--net/atm/mpc.c2
-rw-r--r--net/atm/mpoa_caches.c43
-rw-r--r--net/atm/svc.c7
-rw-r--r--net/ax25/af_ax25.c7
-rw-r--r--net/ax25/ax25_addr.c2
-rw-r--r--net/ax25/ax25_dev.c2
-rw-r--r--net/ax25/ax25_ds_in.c2
-rw-r--r--net/ax25/ax25_ds_subr.c2
-rw-r--r--net/ax25/ax25_ds_timer.c2
-rw-r--r--net/ax25/ax25_iface.c2
-rw-r--r--net/ax25/ax25_in.c2
-rw-r--r--net/ax25/ax25_ip.c2
-rw-r--r--net/ax25/ax25_out.c2
-rw-r--r--net/ax25/ax25_route.c2
-rw-r--r--net/ax25/ax25_std_in.c2
-rw-r--r--net/ax25/ax25_std_subr.c2
-rw-r--r--net/ax25/ax25_std_timer.c2
-rw-r--r--net/ax25/ax25_subr.c4
-rw-r--r--net/ax25/ax25_timer.c2
-rw-r--r--net/ax25/ax25_uid.c2
-rw-r--r--net/batman-adv/Kconfig2
-rw-r--r--net/batman-adv/Makefile2
-rw-r--r--net/batman-adv/bat_algo.c2
-rw-r--r--net/batman-adv/bat_algo.h2
-rw-r--r--net/batman-adv/bat_iv_ogm.c71
-rw-r--r--net/batman-adv/bat_iv_ogm.h2
-rw-r--r--net/batman-adv/bat_v.c24
-rw-r--r--net/batman-adv/bat_v.h2
-rw-r--r--net/batman-adv/bat_v_elp.c73
-rw-r--r--net/batman-adv/bat_v_elp.h2
-rw-r--r--net/batman-adv/bat_v_ogm.c77
-rw-r--r--net/batman-adv/bat_v_ogm.h2
-rw-r--r--net/batman-adv/bitarray.c2
-rw-r--r--net/batman-adv/bitarray.h2
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c3
-rw-r--r--net/batman-adv/bridge_loop_avoidance.h20
-rw-r--r--net/batman-adv/debugfs.c30
-rw-r--r--net/batman-adv/debugfs.h2
-rw-r--r--net/batman-adv/distributed-arp-table.c87
-rw-r--r--net/batman-adv/distributed-arp-table.h2
-rw-r--r--net/batman-adv/fragmentation.c118
-rw-r--r--net/batman-adv/fragmentation.h4
-rw-r--r--net/batman-adv/gateway_client.c11
-rw-r--r--net/batman-adv/gateway_client.h2
-rw-r--r--net/batman-adv/gateway_common.c7
-rw-r--r--net/batman-adv/gateway_common.h2
-rw-r--r--net/batman-adv/hard-interface.c227
-rw-r--r--net/batman-adv/hard-interface.h23
-rw-r--r--net/batman-adv/hash.c2
-rw-r--r--net/batman-adv/hash.h32
-rw-r--r--net/batman-adv/icmp_socket.c7
-rw-r--r--net/batman-adv/icmp_socket.h2
-rw-r--r--net/batman-adv/log.c6
-rw-r--r--net/batman-adv/log.h14
-rw-r--r--net/batman-adv/main.c18
-rw-r--r--net/batman-adv/main.h30
-rw-r--r--net/batman-adv/multicast.c72
-rw-r--r--net/batman-adv/multicast.h8
-rw-r--r--net/batman-adv/netlink.c33
-rw-r--r--net/batman-adv/netlink.h2
-rw-r--r--net/batman-adv/network-coding.c45
-rw-r--r--net/batman-adv/network-coding.h2
-rw-r--r--net/batman-adv/originator.c25
-rw-r--r--net/batman-adv/originator.h2
-rw-r--r--net/batman-adv/packet.h14
-rw-r--r--net/batman-adv/routing.c181
-rw-r--r--net/batman-adv/routing.h2
-rw-r--r--net/batman-adv/send.c419
-rw-r--r--net/batman-adv/send.h13
-rw-r--r--net/batman-adv/soft-interface.c35
-rw-r--r--net/batman-adv/soft-interface.h2
-rw-r--r--net/batman-adv/sysfs.c55
-rw-r--r--net/batman-adv/sysfs.h2
-rw-r--r--net/batman-adv/tp_meter.c10
-rw-r--r--net/batman-adv/tp_meter.h2
-rw-r--r--net/batman-adv/translation-table.c43
-rw-r--r--net/batman-adv/translation-table.h2
-rw-r--r--net/batman-adv/tvlv.c7
-rw-r--r--net/batman-adv/tvlv.h2
-rw-r--r--net/batman-adv/types.h51
-rw-r--r--net/bluetooth/6lowpan.c2
-rw-r--r--net/bluetooth/Makefile2
-rw-r--r--net/bluetooth/a2mp.c4
-rw-r--r--net/bluetooth/af_bluetooth.c4
-rw-r--r--net/bluetooth/amp.c4
-rw-r--r--net/bluetooth/bnep/netdev.c3
-rw-r--r--net/bluetooth/cmtp/capi.c2
-rw-r--r--net/bluetooth/hci_event.c2
-rw-r--r--net/bluetooth/hci_request.c2
-rw-r--r--net/bluetooth/hci_sock.c6
-rw-r--r--net/bluetooth/l2cap_core.c10
-rw-r--r--net/bluetooth/l2cap_sock.c3
-rw-r--r--net/bluetooth/rfcomm/sock.c4
-rw-r--r--net/bluetooth/sco.c3
-rw-r--r--net/bluetooth/smp.c85
-rw-r--r--net/bluetooth/smp.h1
-rw-r--r--net/bridge/Makefile5
-rw-r--r--net/bridge/br_device.c38
-rw-r--r--net/bridge/br_fdb.c225
-rw-r--r--net/bridge/br_forward.c44
-rw-r--r--net/bridge/br_if.c3
-rw-r--r--net/bridge/br_input.c18
-rw-r--r--net/bridge/br_ioctl.c4
-rw-r--r--net/bridge/br_mdb.c2
-rw-r--r--net/bridge/br_multicast.c340
-rw-r--r--net/bridge/br_netfilter_hooks.c63
-rw-r--r--net/bridge/br_netfilter_ipv6.c2
-rw-r--r--net/bridge/br_netlink.c218
-rw-r--r--net/bridge/br_netlink_tunnel.c294
-rw-r--r--net/bridge/br_private.h93
-rw-r--r--net/bridge/br_private_stp.h1
-rw-r--r--net/bridge/br_private_tunnel.h83
-rw-r--r--net/bridge/br_stp.c67
-rw-r--r--net/bridge/br_stp_if.c18
-rw-r--r--net/bridge/br_stp_timer.c4
-rw-r--r--net/bridge/br_sysfs_br.c43
-rw-r--r--net/bridge/br_sysfs_if.c3
-rw-r--r--net/bridge/br_vlan.c24
-rw-r--r--net/bridge/br_vlan_tunnel.c205
-rw-r--r--net/bridge/netfilter/Kconfig1
-rw-r--r--net/bridge/netfilter/ebt_among.c2
-rw-r--r--net/bridge/netfilter/ebt_arpreply.c3
-rw-r--r--net/bridge/netfilter/ebt_limit.c1
-rw-r--r--net/bridge/netfilter/ebt_log.c13
-rw-r--r--net/bridge/netfilter/ebt_nflog.c6
-rw-r--r--net/bridge/netfilter/ebt_redirect.c6
-rw-r--r--net/bridge/netfilter/ebtable_broute.c2
-rw-r--r--net/bridge/netfilter/ebtables.c86
-rw-r--r--net/bridge/netfilter/nf_log_bridge.c17
-rw-r--r--net/bridge/netfilter/nft_meta_bridge.c2
-rw-r--r--net/bridge/netfilter/nft_reject_bridge.c30
-rw-r--r--net/caif/caif_dev.c2
-rw-r--r--net/caif/caif_socket.c2
-rw-r--r--net/caif/cfcnfg.c9
-rw-r--r--net/caif/chnl_net.c1
-rw-r--r--net/can/af_can.c12
-rw-r--r--net/can/af_can.h3
-rw-r--r--net/can/bcm.c59
-rw-r--r--net/can/gw.c4
-rw-r--r--net/can/raw.c4
-rw-r--r--net/ceph/auth.c4
-rw-r--r--net/ceph/auth_x.c197
-rw-r--r--net/ceph/auth_x.h3
-rw-r--r--net/ceph/ceph_common.c15
-rw-r--r--net/ceph/cls_lock_client.c14
-rw-r--r--net/ceph/crush/crush.c5
-rw-r--r--net/ceph/crush/mapper.c229
-rw-r--r--net/ceph/crypto.c465
-rw-r--r--net/ceph/crypto.h26
-rw-r--r--net/ceph/messenger.c73
-rw-r--r--net/ceph/mon_client.c12
-rw-r--r--net/ceph/osd_client.c199
-rw-r--r--net/ceph/osdmap.c101
-rw-r--r--net/ceph/snapshot.c2
-rw-r--r--net/compat.c36
-rw-r--r--net/core/Makefile2
-rw-r--r--net/core/datagram.c101
-rw-r--r--net/core/dev.c1433
-rw-r--r--net/core/devlink.c122
-rw-r--r--net/core/drop_monitor.c60
-rw-r--r--net/core/dst.c1
-rw-r--r--net/core/ethtool.c150
-rw-r--r--net/core/fib_rules.c78
-rw-r--r--net/core/filter.c562
-rw-r--r--net/core/flow.c60
-rw-r--r--net/core/flow_dissector.c95
-rw-r--r--net/core/gen_estimator.c296
-rw-r--r--net/core/gen_stats.c20
-rw-r--r--net/core/gro_cells.c92
-rw-r--r--net/core/lwt_bpf.c397
-rw-r--r--net/core/lwtunnel.c85
-rw-r--r--net/core/neighbour.c22
-rw-r--r--net/core/net-sysfs.c66
-rw-r--r--net/core/net_namespace.c79
-rw-r--r--net/core/netclassid_cgroup.c34
-rw-r--r--net/core/netpoll.c16
-rw-r--r--net/core/netprio_cgroup.c3
-rw-r--r--net/core/pktgen.c6
-rw-r--r--net/core/request_sock.c2
-rw-r--r--net/core/rtnetlink.c106
-rw-r--r--net/core/scm.c5
-rw-r--r--net/core/secure_seq.c177
-rw-r--r--net/core/skbuff.c172
-rw-r--r--net/core/sock.c239
-rw-r--r--net/core/stream.c29
-rw-r--r--net/core/sysctl_net_core.c49
-rw-r--r--net/core/utils.c2
-rw-r--r--net/dccp/ccids/ccid2.c1
-rw-r--r--net/dccp/input.c13
-rw-r--r--net/dccp/ipv4.c18
-rw-r--r--net/dccp/ipv6.c22
-rw-r--r--net/dccp/minisocks.c29
-rw-r--r--net/dccp/output.c1
-rw-r--r--net/decnet/af_decnet.c23
-rw-r--r--net/decnet/dn_dev.c4
-rw-r--r--net/decnet/dn_fib.c2
-rw-r--r--net/decnet/dn_table.c2
-rw-r--r--net/decnet/sysctl_net_decnet.c2
-rw-r--r--net/dns_resolver/dns_query.c6
-rw-r--r--net/dsa/Kconfig16
-rw-r--r--net/dsa/Makefile2
-rw-r--r--net/dsa/dsa.c262
-rw-r--r--net/dsa/dsa2.c254
-rw-r--r--net/dsa/dsa_priv.h25
-rw-r--r--net/dsa/slave.c497
-rw-r--r--net/dsa/switch.c85
-rw-r--r--net/dsa/tag_brcm.c11
-rw-r--r--net/dsa/tag_dsa.c10
-rw-r--r--net/dsa/tag_edsa.c10
-rw-r--r--net/dsa/tag_qca.c4
-rw-r--r--net/dsa/tag_trailer.c6
-rw-r--r--net/ethernet/eth.c39
-rw-r--r--net/hsr/hsr_device.c3
-rw-r--r--net/hsr/hsr_netlink.c23
-rw-r--r--net/hsr/hsr_slave.c3
-rw-r--r--net/ieee802154/6lowpan/6lowpan_i.h2
-rw-r--r--net/ieee802154/Makefile2
-rw-r--r--net/ieee802154/netlink.c24
-rw-r--r--net/ieee802154/nl-phy.c6
-rw-r--r--net/ieee802154/nl802154.c44
-rw-r--r--net/ieee802154/socket.c4
-rw-r--r--net/ife/Kconfig16
-rw-r--r--net/ife/Makefile5
-rw-r--r--net/ife/ife.c142
-rw-r--r--net/ipv4/Kconfig22
-rw-r--r--net/ipv4/Makefile2
-rw-r--r--net/ipv4/af_inet.c67
-rw-r--r--net/ipv4/ah4.c3
-rw-r--r--net/ipv4/arp.c12
-rw-r--r--net/ipv4/cipso_ipv4.c4
-rw-r--r--net/ipv4/devinet.c5
-rw-r--r--net/ipv4/esp4.c332
-rw-r--r--net/ipv4/esp4_offload.c106
-rw-r--r--net/ipv4/fib_frontend.c25
-rw-r--r--net/ipv4/fib_semantics.c85
-rw-r--r--net/ipv4/fib_trie.c195
-rw-r--r--net/ipv4/fou.c23
-rw-r--r--net/ipv4/icmp.c133
-rw-r--r--net/ipv4/igmp.c10
-rw-r--r--net/ipv4/inet_connection_sock.c284
-rw-r--r--net/ipv4/inet_diag.c75
-rw-r--r--net/ipv4/inet_hashtables.c19
-rw-r--r--net/ipv4/inet_timewait_sock.c3
-rw-r--r--net/ipv4/ip_fragment.c25
-rw-r--r--net/ipv4/ip_gre.c6
-rw-r--r--net/ipv4/ip_options.c2
-rw-r--r--net/ipv4/ip_output.c53
-rw-r--r--net/ipv4/ip_sockglue.c80
-rw-r--r--net/ipv4/ip_tunnel.c10
-rw-r--r--net/ipv4/ip_tunnel_core.c12
-rw-r--r--net/ipv4/ip_vti.c2
-rw-r--r--net/ipv4/ipconfig.c4
-rw-r--r--net/ipv4/ipip.c4
-rw-r--r--net/ipv4/ipmr.c296
-rw-r--r--net/ipv4/netfilter.c7
-rw-r--r--net/ipv4/netfilter/Kconfig14
-rw-r--r--net/ipv4/netfilter/Makefile3
-rw-r--r--net/ipv4/netfilter/arp_tables.c59
-rw-r--r--net/ipv4/netfilter/ip_tables.c65
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c44
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c11
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c4
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c19
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c18
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c151
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c15
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c45
-rw-r--r--net/ipv4/netfilter/nf_dup_ipv4.c7
-rw-r--r--net/ipv4/netfilter/nf_log_arp.c4
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c5
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c20
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_socket_ipv4.c163
-rw-r--r--net/ipv4/netfilter/nft_dup_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nft_fib_ipv4.c236
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c23
-rw-r--r--net/ipv4/netfilter/nft_redir_ipv4.c22
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c4
-rw-r--r--net/ipv4/ping.c27
-rw-r--r--net/ipv4/proc.c5
-rw-r--r--net/ipv4/raw.c43
-rw-r--r--net/ipv4/raw_diag.c266
-rw-r--r--net/ipv4/route.c172
-rw-r--r--net/ipv4/syncookies.c24
-rw-r--r--net/ipv4/sysctl_net_ipv4.c126
-rw-r--r--net/ipv4/tcp.c220
-rw-r--r--net/ipv4/tcp_bbr.c32
-rw-r--r--net/ipv4/tcp_cdg.c2
-rw-r--r--net/ipv4/tcp_cong.c14
-rw-r--r--net/ipv4/tcp_dctcp.c1
-rw-r--r--net/ipv4/tcp_fastopen.c57
-rw-r--r--net/ipv4/tcp_highspeed.c11
-rw-r--r--net/ipv4/tcp_hybla.c1
-rw-r--r--net/ipv4/tcp_illinois.c10
-rw-r--r--net/ipv4/tcp_input.c349
-rw-r--r--net/ipv4/tcp_ipv4.c90
-rw-r--r--net/ipv4/tcp_lp.c1
-rw-r--r--net/ipv4/tcp_metrics.c32
-rw-r--r--net/ipv4/tcp_minisocks.c30
-rw-r--r--net/ipv4/tcp_output.c339
-rw-r--r--net/ipv4/tcp_recovery.c149
-rw-r--r--net/ipv4/tcp_scalable.c15
-rw-r--r--net/ipv4/tcp_timer.c15
-rw-r--r--net/ipv4/tcp_vegas.c1
-rw-r--r--net/ipv4/tcp_veno.c10
-rw-r--r--net/ipv4/tcp_westwood.c1
-rw-r--r--net/ipv4/tcp_yeah.c10
-rw-r--r--net/ipv4/udp.c373
-rw-r--r--net/ipv4/udplite.c3
-rw-r--r--net/ipv4/xfrm4_input.c6
-rw-r--r--net/ipv4/xfrm4_mode_transport.c4
-rw-r--r--net/ipv4/xfrm4_policy.c9
-rw-r--r--net/ipv4/xfrm4_protocol.c3
-rw-r--r--net/ipv4/xfrm4_state.c8
-rw-r--r--net/ipv6/Kconfig49
-rw-r--r--net/ipv6/Makefile5
-rw-r--r--net/ipv6/addrconf.c206
-rw-r--r--net/ipv6/af_inet6.c33
-rw-r--r--net/ipv6/ah6.c8
-rw-r--r--net/ipv6/datagram.c34
-rw-r--r--net/ipv6/esp6.c323
-rw-r--r--net/ipv6/esp6_offload.c108
-rw-r--r--net/ipv6/exthdrs.c244
-rw-r--r--net/ipv6/icmp.c77
-rw-r--r--net/ipv6/ila/ila_lwt.c95
-rw-r--r--net/ipv6/ila/ila_xlat.c43
-rw-r--r--net/ipv6/inet6_connection_sock.c43
-rw-r--r--net/ipv6/inet6_hashtables.c46
-rw-r--r--net/ipv6/ip6_fib.c24
-rw-r--r--net/ipv6/ip6_flowlabel.c2
-rw-r--r--net/ipv6/ip6_gre.c57
-rw-r--r--net/ipv6/ip6_input.c7
-rw-r--r--net/ipv6/ip6_offload.c7
-rw-r--r--net/ipv6/ip6_output.c47
-rw-r--r--net/ipv6/ip6_tunnel.c54
-rw-r--r--net/ipv6/ip6_vti.c46
-rw-r--r--net/ipv6/ip6mr.c41
-rw-r--r--net/ipv6/ipcomp6.c5
-rw-r--r--net/ipv6/ipv6_sockglue.c36
-rw-r--r--net/ipv6/mcast.c50
-rw-r--r--net/ipv6/mip6.c2
-rw-r--r--net/ipv6/ndisc.c29
-rw-r--r--net/ipv6/netfilter.c1
-rw-r--r--net/ipv6/netfilter/Kconfig14
-rw-r--r--net/ipv6/netfilter/Makefile3
-rw-r--r--net/ipv6/netfilter/ip6_tables.c66
-rw-r--r--net/ipv6/netfilter/ip6t_MASQUERADE.c2
-rw-r--r--net/ipv6/netfilter/ip6t_NPT.c2
-rw-r--r--net/ipv6/netfilter/ip6t_REJECT.c23
-rw-r--r--net/ipv6/netfilter/ip6t_SYNPROXY.c19
-rw-r--r--net/ipv6/netfilter/ip6t_rpfilter.c11
-rw-r--r--net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c146
-rw-r--r--net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c20
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c1
-rw-r--r--net/ipv6/netfilter/nf_defrag_ipv6_hooks.c46
-rw-r--r--net/ipv6/netfilter/nf_dup_ipv6.c7
-rw-r--r--net/ipv6/netfilter/nf_log_ipv6.c4
-rw-r--r--net/ipv6/netfilter/nf_reject_ipv6.c3
-rw-r--r--net/ipv6/netfilter/nf_socket_ipv6.c151
-rw-r--r--net/ipv6/netfilter/nft_dup_ipv6.c2
-rw-r--r--net/ipv6/netfilter/nft_fib_ipv6.c270
-rw-r--r--net/ipv6/netfilter/nft_masq_ipv6.c22
-rw-r--r--net/ipv6/netfilter/nft_redir_ipv6.c22
-rw-r--r--net/ipv6/netfilter/nft_reject_ipv6.c6
-rw-r--r--net/ipv6/ping.c8
-rw-r--r--net/ipv6/raw.c20
-rw-r--r--net/ipv6/reassembly.c7
-rw-r--r--net/ipv6/route.c401
-rw-r--r--net/ipv6/seg6.c500
-rw-r--r--net/ipv6/seg6_hmac.c448
-rw-r--r--net/ipv6/seg6_iptunnel.c436
-rw-r--r--net/ipv6/sit.c19
-rw-r--r--net/ipv6/syncookies.c43
-rw-r--r--net/ipv6/tcp_ipv6.c96
-rw-r--r--net/ipv6/udp.c88
-rw-r--r--net/ipv6/udplite.c3
-rw-r--r--net/ipv6/xfrm6_input.c22
-rw-r--r--net/ipv6/xfrm6_mode_transport.c4
-rw-r--r--net/ipv6/xfrm6_policy.c9
-rw-r--r--net/ipv6/xfrm6_protocol.c3
-rw-r--r--net/ipv6/xfrm6_tunnel.c2
-rw-r--r--net/ipx/af_ipx.c4
-rw-r--r--net/irda/af_irda.c8
-rw-r--r--net/irda/ircomm/ircomm_tty.c4
-rw-r--r--net/irda/ircomm/ircomm_tty_ioctl.c2
-rw-r--r--net/irda/irda_device.c2
-rw-r--r--net/irda/irlan/irlan_eth.c4
-rw-r--r--net/irda/irnet/irnet.h3
-rw-r--r--net/irda/irnet/irnet_ppp.c15
-rw-r--r--net/irda/irnet/irnet_ppp.h11
-rw-r--r--net/irda/irnetlink.c22
-rw-r--r--net/irda/irproc.c1
-rw-r--r--net/irda/irqueue.c34
-rw-r--r--net/iucv/af_iucv.c63
-rw-r--r--net/iucv/iucv.c124
-rw-r--r--net/kcm/kcmsock.c54
-rw-r--r--net/key/af_key.c95
-rw-r--r--net/l2tp/l2tp_core.c170
-rw-r--r--net/l2tp/l2tp_core.h20
-rw-r--r--net/l2tp/l2tp_debugfs.c10
-rw-r--r--net/l2tp/l2tp_eth.c18
-rw-r--r--net/l2tp/l2tp_ip.c97
-rw-r--r--net/l2tp/l2tp_ip6.c79
-rw-r--r--net/l2tp/l2tp_netlink.c111
-rw-r--r--net/l2tp/l2tp_ppp.c163
-rw-r--r--net/lapb/lapb_iface.c2
-rw-r--r--net/lapb/lapb_in.c2
-rw-r--r--net/lapb/lapb_out.c2
-rw-r--r--net/lapb/lapb_subr.c2
-rw-r--r--net/lapb/lapb_timer.c2
-rw-r--r--net/llc/af_llc.c30
-rw-r--r--net/llc/llc_conn.c3
-rw-r--r--net/llc/llc_sap.c3
-rw-r--r--net/mac80211/Kconfig1
-rw-r--r--net/mac80211/Makefile3
-rw-r--r--net/mac80211/aes_cmac.c126
-rw-r--r--net/mac80211/aes_cmac.h11
-rw-r--r--net/mac80211/agg-rx.c11
-rw-r--r--net/mac80211/cfg.c51
-rw-r--r--net/mac80211/chan.c7
-rw-r--r--net/mac80211/debugfs.c36
-rw-r--r--net/mac80211/debugfs_netdev.c14
-rw-r--r--net/mac80211/debugfs_sta.c10
-rw-r--r--net/mac80211/fils_aead.c334
-rw-r--r--net/mac80211/fils_aead.h19
-rw-r--r--net/mac80211/ibss.c4
-rw-r--r--net/mac80211/ieee80211_i.h38
-rw-r--r--net/mac80211/iface.c59
-rw-r--r--net/mac80211/key.c3
-rw-r--r--net/mac80211/key.h2
-rw-r--r--net/mac80211/main.c18
-rw-r--r--net/mac80211/mesh.c11
-rw-r--r--net/mac80211/mesh.h2
-rw-r--r--net/mac80211/mesh_plink.c16
-rw-r--r--net/mac80211/mesh_sync.c27
-rw-r--r--net/mac80211/mlme.c94
-rw-r--r--net/mac80211/pm.c1
-rw-r--r--net/mac80211/rc80211_minstrel.c21
-rw-r--r--net/mac80211/rc80211_minstrel.h33
-rw-r--r--net/mac80211/rc80211_minstrel_debugfs.c24
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c68
-rw-r--r--net/mac80211/rc80211_minstrel_ht.h6
-rw-r--r--net/mac80211/rc80211_minstrel_ht_debugfs.c32
-rw-r--r--net/mac80211/rx.c172
-rw-r--r--net/mac80211/scan.c8
-rw-r--r--net/mac80211/sta_info.c59
-rw-r--r--net/mac80211/sta_info.h12
-rw-r--r--net/mac80211/status.c17
-rw-r--r--net/mac80211/trace.h27
-rw-r--r--net/mac80211/tx.c211
-rw-r--r--net/mac80211/util.c61
-rw-r--r--net/mac80211/vht.c8
-rw-r--r--net/mac80211/wep.c3
-rw-r--r--net/mac80211/wme.c23
-rw-r--r--net/mac80211/wpa.c5
-rw-r--r--net/mac802154/Makefile2
-rw-r--r--net/mac802154/llsec.c2
-rw-r--r--net/mac802154/util.c4
-rw-r--r--net/mpls/af_mpls.c452
-rw-r--r--net/mpls/internal.h58
-rw-r--r--net/mpls/mpls_iptunnel.c19
-rw-r--r--net/netfilter/Kconfig70
-rw-r--r--net/netfilter/Makefile23
-rw-r--r--net/netfilter/core.c102
-rw-r--r--net/netfilter/ipset/Kconfig9
-rw-r--r--net/netfilter/ipset/Makefile1
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_gen.h31
-rw-r--r--net/netfilter/ipset/ip_set_core.c22
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h256
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmac.c315
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmark.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c6
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c6
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_net.c8
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_netnet.c8
-rw-r--r--net/netfilter/ipset/ip_set_hash_netport.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_netportnet.c10
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c46
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c42
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c54
-rw-r--r--net/netfilter/nf_conntrack_core.c144
-rw-r--r--net/netfilter/nf_conntrack_ecache.c2
-rw-r--r--net/netfilter/nf_conntrack_expect.c18
-rw-r--r--net/netfilter/nf_conntrack_extend.c13
-rw-r--r--net/netfilter/nf_conntrack_ftp.c2
-rw-r--r--net/netfilter/nf_conntrack_helper.c56
-rw-r--r--net/netfilter/nf_conntrack_netlink.c75
-rw-r--r--net/netfilter/nf_conntrack_proto.c158
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c102
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c13
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c132
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c1
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c124
-rw-r--r--net/netfilter/nf_conntrack_proto_udplite.c409
-rw-r--r--net/netfilter/nf_conntrack_sip.c14
-rw-r--r--net/netfilter/nf_conntrack_standalone.c13
-rw-r--r--net/netfilter/nf_dup_netdev.c35
-rw-r--r--net/netfilter/nf_internals.h5
-rw-r--r--net/netfilter/nf_log.c25
-rw-r--r--net/netfilter/nf_log_common.c28
-rw-r--r--net/netfilter/nf_log_netdev.c81
-rw-r--r--net/netfilter/nf_nat_core.c14
-rw-r--r--net/netfilter/nf_nat_helper.c2
-rw-r--r--net/netfilter/nf_nat_proto_dccp.c36
-rw-r--r--net/netfilter/nf_nat_proto_sctp.c53
-rw-r--r--net/netfilter/nf_nat_proto_udp.c78
-rw-r--r--net/netfilter/nf_nat_proto_udplite.c106
-rw-r--r--net/netfilter/nf_nat_redirect.c2
-rw-r--r--net/netfilter/nf_queue.c36
-rw-r--r--net/netfilter/nf_synproxy_core.c2
-rw-r--r--net/netfilter/nf_tables_api.c996
-rw-r--r--net/netfilter/nf_tables_core.c91
-rw-r--r--net/netfilter/nf_tables_trace.c8
-rw-r--r--net/netfilter/nfnetlink.c94
-rw-r--r--net/netfilter/nfnetlink_cthelper.c289
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c2
-rw-r--r--net/netfilter/nfnetlink_log.c7
-rw-r--r--net/netfilter/nfnetlink_queue.c19
-rw-r--r--net/netfilter/nft_bitwise.c13
-rw-r--r--net/netfilter/nft_byteorder.c13
-rw-r--r--net/netfilter/nft_cmp.c16
-rw-r--r--net/netfilter/nft_counter.c223
-rw-r--r--net/netfilter/nft_ct.c306
-rw-r--r--net/netfilter/nft_dynset.c16
-rw-r--r--net/netfilter/nft_exthdr.c139
-rw-r--r--net/netfilter/nft_fib.c159
-rw-r--r--net/netfilter/nft_fib_inet.c82
-rw-r--r--net/netfilter/nft_fwd_netdev.c4
-rw-r--r--net/netfilter/nft_hash.c12
-rw-r--r--net/netfilter/nft_immediate.c16
-rw-r--r--net/netfilter/nft_log.c8
-rw-r--r--net/netfilter/nft_lookup.c21
-rw-r--r--net/netfilter/nft_masq.c6
-rw-r--r--net/netfilter/nft_meta.c74
-rw-r--r--net/netfilter/nft_nat.c19
-rw-r--r--net/netfilter/nft_numgen.c2
-rw-r--r--net/netfilter/nft_objref.c228
-rw-r--r--net/netfilter/nft_payload.c145
-rw-r--r--net/netfilter/nft_queue.c4
-rw-r--r--net/netfilter/nft_quota.c160
-rw-r--r--net/netfilter/nft_range.c13
-rw-r--r--net/netfilter/nft_redir.c6
-rw-r--r--net/netfilter/nft_reject_inet.c18
-rw-r--r--net/netfilter/nft_rt.c153
-rw-r--r--net/netfilter/nft_set_bitmap.c307
-rw-r--r--net/netfilter/nft_set_hash.c35
-rw-r--r--net/netfilter/nft_set_rbtree.c31
-rw-r--r--net/netfilter/x_tables.c130
-rw-r--r--net/netfilter/xt_AUDIT.c10
-rw-r--r--net/netfilter/xt_CONNSECMARK.c4
-rw-r--r--net/netfilter/xt_CT.c21
-rw-r--r--net/netfilter/xt_LOG.c6
-rw-r--r--net/netfilter/xt_NETMAP.c31
-rw-r--r--net/netfilter/xt_NFLOG.c6
-rw-r--r--net/netfilter/xt_NFQUEUE.c4
-rw-r--r--net/netfilter/xt_RATEEST.c5
-rw-r--r--net/netfilter/xt_REDIRECT.c16
-rw-r--r--net/netfilter/xt_TCPMSS.c10
-rw-r--r--net/netfilter/xt_TEE.c6
-rw-r--r--net/netfilter/xt_TPROXY.c36
-rw-r--r--net/netfilter/xt_addrtype.c10
-rw-r--r--net/netfilter/xt_bpf.c98
-rw-r--r--net/netfilter/xt_cgroup.c1
-rw-r--r--net/netfilter/xt_cluster.c2
-rw-r--r--net/netfilter/xt_connbytes.c4
-rw-r--r--net/netfilter/xt_connlabel.c6
-rw-r--r--net/netfilter/xt_connlimit.c19
-rw-r--r--net/netfilter/xt_connmark.c8
-rw-r--r--net/netfilter/xt_conntrack.c12
-rw-r--r--net/netfilter/xt_devgroup.c4
-rw-r--r--net/netfilter/xt_dscp.c2
-rw-r--r--net/netfilter/xt_hashlimit.c31
-rw-r--r--net/netfilter/xt_helper.c4
-rw-r--r--net/netfilter/xt_ipvs.c4
-rw-r--r--net/netfilter/xt_limit.c2
-rw-r--r--net/netfilter/xt_multiport.c52
-rw-r--r--net/netfilter/xt_nat.c18
-rw-r--r--net/netfilter/xt_nfacct.c2
-rw-r--r--net/netfilter/xt_osf.c10
-rw-r--r--net/netfilter/xt_owner.c4
-rw-r--r--net/netfilter/xt_pkttype.c5
-rw-r--r--net/netfilter/xt_policy.c4
-rw-r--r--net/netfilter/xt_quota.c1
-rw-r--r--net/netfilter/xt_rateest.c29
-rw-r--r--net/netfilter/xt_recent.c12
-rw-r--r--net/netfilter/xt_set.c38
-rw-r--r--net/netfilter/xt_socket.c336
-rw-r--r--net/netfilter/xt_state.c4
-rw-r--r--net/netfilter/xt_string.c1
-rw-r--r--net/netfilter/xt_time.c2
-rw-r--r--net/netlabel/netlabel_calipso.c21
-rw-r--r--net/netlabel/netlabel_cipso_v4.c22
-rw-r--r--net/netlabel/netlabel_kapi.c5
-rw-r--r--net/netlabel/netlabel_mgmt.c21
-rw-r--r--net/netlabel/netlabel_unlabeled.c21
-rw-r--r--net/netlink/af_netlink.c57
-rw-r--r--net/netlink/genetlink.c325
-rw-r--r--net/netrom/af_netrom.c5
-rw-r--r--net/nfc/llcp_sock.c3
-rw-r--r--net/nfc/netlink.c34
-rw-r--r--net/openvswitch/actions.c176
-rw-r--r--net/openvswitch/conntrack.c326
-rw-r--r--net/openvswitch/conntrack.h14
-rw-r--r--net/openvswitch/datapath.c34
-rw-r--r--net/openvswitch/datapath.h2
-rw-r--r--net/openvswitch/flow.c153
-rw-r--r--net/openvswitch/flow.h77
-rw-r--r--net/openvswitch/flow_netlink.c274
-rw-r--r--net/openvswitch/flow_netlink.h7
-rw-r--r--net/openvswitch/vport-internal_dev.c16
-rw-r--r--net/openvswitch/vport-netdev.c10
-rw-r--r--net/openvswitch/vport.c48
-rw-r--r--net/openvswitch/vport.h3
-rw-r--r--net/packet/af_packet.c213
-rw-r--r--net/packet/diag.c3
-rw-r--r--net/phonet/pep-gprs.c12
-rw-r--r--net/phonet/pep.c16
-rw-r--r--net/phonet/pn_dev.c2
-rw-r--r--net/phonet/socket.c6
-rw-r--r--net/psample/Kconfig15
-rw-r--r--net/psample/Makefile5
-rw-r--r--net/psample/psample.c301
-rw-r--r--net/qrtr/qrtr.c8
-rw-r--r--net/rds/af_rds.c35
-rw-r--r--net/rds/bind.c4
-rw-r--r--net/rds/connection.c29
-rw-r--r--net/rds/ib.c20
-rw-r--r--net/rds/ib.h30
-rw-r--r--net/rds/ib_cm.c136
-rw-r--r--net/rds/ib_frmr.c16
-rw-r--r--net/rds/ib_mr.h3
-rw-r--r--net/rds/ib_recv.c14
-rw-r--r--net/rds/ib_send.c30
-rw-r--r--net/rds/ib_stats.c2
-rw-r--r--net/rds/message.c1
-rw-r--r--net/rds/page.c29
-rw-r--r--net/rds/rdma.c24
-rw-r--r--net/rds/rdma_transport.c16
-rw-r--r--net/rds/rds.h44
-rw-r--r--net/rds/recv.c72
-rw-r--r--net/rds/send.c59
-rw-r--r--net/rds/tcp.c67
-rw-r--r--net/rds/tcp.h2
-rw-r--r--net/rds/tcp_connect.c14
-rw-r--r--net/rds/tcp_listen.c43
-rw-r--r--net/rds/tcp_recv.c5
-rw-r--r--net/rds/tcp_send.c3
-rw-r--r--net/rds/threads.c3
-rw-r--r--net/rds/transport.c4
-rw-r--r--net/rfkill/Kconfig11
-rw-r--r--net/rfkill/Makefile1
-rw-r--r--net/rfkill/core.c100
-rw-r--r--net/rfkill/rfkill-regulator.c154
-rw-r--r--net/rose/af_rose.c7
-rw-r--r--net/rose/rose_route.c2
-rw-r--r--net/rxrpc/Makefile12
-rw-r--r--net/rxrpc/af_rxrpc.c31
-rw-r--r--net/rxrpc/ar-internal.h196
-rw-r--r--net/rxrpc/call_accept.c51
-rw-r--r--net/rxrpc/call_object.c36
-rw-r--r--net/rxrpc/conn_client.c14
-rw-r--r--net/rxrpc/conn_event.c4
-rw-r--r--net/rxrpc/conn_object.c1
-rw-r--r--net/rxrpc/input.c51
-rw-r--r--net/rxrpc/key.c2
-rw-r--r--net/rxrpc/misc.c151
-rw-r--r--net/rxrpc/proc.c9
-rw-r--r--net/rxrpc/recvmsg.c53
-rw-r--r--net/rxrpc/sendmsg.c99
-rw-r--r--net/sched/Kconfig14
-rw-r--r--net/sched/Makefile1
-rw-r--r--net/sched/act_api.c84
-rw-r--r--net/sched/act_bpf.c19
-rw-r--r--net/sched/act_connmark.c5
-rw-r--r--net/sched/act_csum.c32
-rw-r--r--net/sched/act_gact.c2
-rw-r--r--net/sched/act_ife.c119
-rw-r--r--net/sched/act_ipt.c16
-rw-r--r--net/sched/act_mirred.c94
-rw-r--r--net/sched/act_nat.c2
-rw-r--r--net/sched/act_pedit.c222
-rw-r--r--net/sched/act_police.c23
-rw-r--r--net/sched/act_sample.c276
-rw-r--r--net/sched/act_simple.c2
-rw-r--r--net/sched/act_skbedit.c23
-rw-r--r--net/sched/act_skbmod.c3
-rw-r--r--net/sched/act_tunnel_key.c19
-rw-r--r--net/sched/act_vlan.c2
-rw-r--r--net/sched/cls_api.c218
-rw-r--r--net/sched/cls_bpf.c62
-rw-r--r--net/sched/cls_flow.c2
-rw-r--r--net/sched/cls_flower.c409
-rw-r--r--net/sched/cls_matchall.c160
-rw-r--r--net/sched/cls_u32.c11
-rw-r--r--net/sched/em_ipset.c17
-rw-r--r--net/sched/em_meta.c10
-rw-r--r--net/sched/sch_api.c52
-rw-r--r--net/sched/sch_atm.c1
-rw-r--r--net/sched/sch_cbq.c9
-rw-r--r--net/sched/sch_choke.c1
-rw-r--r--net/sched/sch_drr.c6
-rw-r--r--net/sched/sch_dsmark.c11
-rw-r--r--net/sched/sch_fq.c18
-rw-r--r--net/sched/sch_fq_codel.c7
-rw-r--r--net/sched/sch_generic.c6
-rw-r--r--net/sched/sch_hfsc.c6
-rw-r--r--net/sched/sch_hhf.c8
-rw-r--r--net/sched/sch_htb.c7
-rw-r--r--net/sched/sch_ingress.c1
-rw-r--r--net/sched/sch_mq.c10
-rw-r--r--net/sched/sch_mqprio.c19
-rw-r--r--net/sched/sch_multiq.c2
-rw-r--r--net/sched/sch_netem.c6
-rw-r--r--net/sched/sch_prio.c2
-rw-r--r--net/sched/sch_qfq.c8
-rw-r--r--net/sched/sch_sfb.c1
-rw-r--r--net/sched/sch_sfq.c4
-rw-r--r--net/sched/sch_teql.c10
-rw-r--r--net/sctp/Makefile2
-rw-r--r--net/sctp/associola.c59
-rw-r--r--net/sctp/bind_addr.c3
-rw-r--r--net/sctp/chunk.c139
-rw-r--r--net/sctp/debug.c5
-rw-r--r--net/sctp/endpointola.c6
-rw-r--r--net/sctp/input.c130
-rw-r--r--net/sctp/ipv6.c26
-rw-r--r--net/sctp/objcnt.c2
-rw-r--r--net/sctp/offload.c2
-rw-r--r--net/sctp/output.c526
-rw-r--r--net/sctp/outqueue.c55
-rw-r--r--net/sctp/primitive.c3
-rw-r--r--net/sctp/proc.c4
-rw-r--r--net/sctp/protocol.c70
-rw-r--r--net/sctp/sm_make_chunk.c361
-rw-r--r--net/sctp/sm_sideeffect.c38
-rw-r--r--net/sctp/sm_statefuns.c232
-rw-r--r--net/sctp/sm_statetable.c70
-rw-r--r--net/sctp/socket.c238
-rw-r--r--net/sctp/ssnmap.c125
-rw-r--r--net/sctp/stream.c506
-rw-r--r--net/sctp/transport.c58
-rw-r--r--net/sctp/ulpevent.c29
-rw-r--r--net/sctp/ulpqueue.c36
-rw-r--r--net/smc/Kconfig20
-rw-r--r--net/smc/Makefile4
-rw-r--r--net/smc/af_smc.c1409
-rw-r--r--net/smc/smc.h274
-rw-r--r--net/smc/smc_cdc.c304
-rw-r--r--net/smc/smc_cdc.h218
-rw-r--r--net/smc/smc_clc.c282
-rw-r--r--net/smc/smc_clc.h116
-rw-r--r--net/smc/smc_close.c444
-rw-r--r--net/smc/smc_close.h28
-rw-r--r--net/smc/smc_core.c682
-rw-r--r--net/smc/smc_core.h181
-rw-r--r--net/smc/smc_diag.c215
-rw-r--r--net/smc/smc_ib.c466
-rw-r--r--net/smc/smc_ib.h71
-rw-r--r--net/smc/smc_llc.c158
-rw-r--r--net/smc/smc_llc.h63
-rw-r--r--net/smc/smc_pnet.c534
-rw-r--r--net/smc/smc_pnet.h23
-rw-r--r--net/smc/smc_rx.c219
-rw-r--r--net/smc/smc_rx.h23
-rw-r--r--net/smc/smc_tx.c485
-rw-r--r--net/smc/smc_tx.h35
-rw-r--r--net/smc/smc_wr.c614
-rw-r--r--net/smc/smc_wr.h106
-rw-r--r--net/socket.c68
-rw-r--r--net/strparser/strparser.c1
-rw-r--r--net/sunrpc/auth.c16
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c11
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c12
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c3
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c2
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c6
-rw-r--r--net/sunrpc/auth_null.c3
-rw-r--r--net/sunrpc/auth_unix.c18
-rw-r--r--net/sunrpc/cache.c125
-rw-r--r--net/sunrpc/clnt.c58
-rw-r--r--net/sunrpc/debugfs.c35
-rw-r--r--net/sunrpc/netns.h2
-rw-r--r--net/sunrpc/stats.c10
-rw-r--r--net/sunrpc/sunrpc_syms.c3
-rw-r--r--net/sunrpc/svc.c42
-rw-r--r--net/sunrpc/svc_xprt.c16
-rw-r--r--net/sunrpc/svcauth.c18
-rw-r--r--net/sunrpc/svcauth_unix.c4
-rw-r--r--net/sunrpc/svcsock.c32
-rw-r--r--net/sunrpc/sysctl.c2
-rw-r--r--net/sunrpc/xdr.c34
-rw-r--r--net/sunrpc/xprt.c5
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c4
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c5
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c105
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c118
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c23
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_marshal.c299
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c43
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c138
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c168
-rw-r--r--net/sunrpc/xprtrdma/transport.c40
-rw-r--r--net/sunrpc/xprtrdma/verbs.c134
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h61
-rw-r--r--net/sunrpc/xprtsock.c106
-rw-r--r--net/switchdev/switchdev.c5
-rw-r--r--net/tipc/bcast.c204
-rw-r--r--net/tipc/bcast.h33
-rw-r--r--net/tipc/bearer.c15
-rw-r--r--net/tipc/bearer.h8
-rw-r--r--net/tipc/core.c2
-rw-r--r--net/tipc/core.h2
-rw-r--r--net/tipc/discover.c4
-rw-r--r--net/tipc/link.c89
-rw-r--r--net/tipc/msg.c37
-rw-r--r--net/tipc/msg.h15
-rw-r--r--net/tipc/name_distr.c2
-rw-r--r--net/tipc/name_table.c128
-rw-r--r--net/tipc/name_table.h24
-rw-r--r--net/tipc/net.c4
-rw-r--r--net/tipc/netlink.c27
-rw-r--r--net/tipc/netlink_compat.c25
-rw-r--r--net/tipc/node.c63
-rw-r--r--net/tipc/node.h4
-rw-r--r--net/tipc/server.c48
-rw-r--r--net/tipc/socket.c985
-rw-r--r--net/tipc/subscr.c127
-rw-r--r--net/tipc/subscr.h1
-rw-r--r--net/tipc/udp_media.c8
-rw-r--r--net/unix/af_unix.c87
-rw-r--r--net/unix/garbage.c17
-rw-r--r--net/vmw_vsock/af_vsock.c18
-rw-r--r--net/vmw_vsock/virtio_transport.c103
-rw-r--r--net/vmw_vsock/virtio_transport_common.c35
-rw-r--r--net/vmw_vsock/vmci_transport_notify.c30
-rw-r--r--net/vmw_vsock/vmci_transport_notify_qstate.c30
-rw-r--r--net/wimax/stack.c22
-rw-r--r--net/wireless/Makefile3
-rw-r--r--net/wireless/core.c39
-rw-r--r--net/wireless/core.h13
-rw-r--r--net/wireless/debugfs.c10
-rw-r--r--net/wireless/lib80211_crypt_tkip.c2
-rw-r--r--net/wireless/mesh.c2
-rw-r--r--net/wireless/mlme.c47
-rw-r--r--net/wireless/nl80211.c951
-rw-r--r--net/wireless/nl80211.h10
-rw-r--r--net/wireless/of.c138
-rw-r--r--net/wireless/rdev-ops.h24
-rw-r--r--net/wireless/reg.c27
-rw-r--r--net/wireless/scan.c9
-rw-r--r--net/wireless/sme.c88
-rw-r--r--net/wireless/sysfs.c16
-rw-r--r--net/wireless/trace.h64
-rw-r--r--net/wireless/util.c157
-rw-r--r--net/wireless/wext-core.c67
-rw-r--r--net/wireless/wext-sme.c23
-rw-r--r--net/x25/af_x25.c7
-rw-r--r--net/x25/sysctl_net_x25.c2
-rw-r--r--net/x25/x25_link.c2
-rw-r--r--net/xfrm/Kconfig5
-rw-r--r--net/xfrm/xfrm_input.c111
-rw-r--r--net/xfrm/xfrm_output.c8
-rw-r--r--net/xfrm/xfrm_policy.c173
-rw-r--r--net/xfrm/xfrm_state.c98
-rw-r--r--net/xfrm/xfrm_user.c11
890 files changed, 39173 insertions, 15469 deletions
diff --git a/net/6lowpan/nhc.c b/net/6lowpan/nhc.c
index 7008d53e455c..4fa2fdda174d 100644
--- a/net/6lowpan/nhc.c
+++ b/net/6lowpan/nhc.c
@@ -27,8 +27,8 @@ static int lowpan_nhc_insert(struct lowpan_nhc *nhc)
/* Figure out where to put new node */
while (*new) {
- struct lowpan_nhc *this = container_of(*new, struct lowpan_nhc,
- node);
+ struct lowpan_nhc *this = rb_entry(*new, struct lowpan_nhc,
+ node);
int result, len_dif, len;
len_dif = nhc->idlen - this->idlen;
@@ -69,8 +69,8 @@ static struct lowpan_nhc *lowpan_nhc_by_nhcid(const struct sk_buff *skb)
const u8 *nhcid_skb_ptr = skb->data;
while (node) {
- struct lowpan_nhc *nhc = container_of(node, struct lowpan_nhc,
- node);
+ struct lowpan_nhc *nhc = rb_entry(node, struct lowpan_nhc,
+ node);
u8 nhcid_skb_ptr_masked[LOWPAN_NHC_MAX_ID_LEN];
int result, i;
diff --git a/net/802/fc.c b/net/802/fc.c
index 7b9219022418..1bb496ea997e 100644
--- a/net/802/fc.c
+++ b/net/802/fc.c
@@ -10,7 +10,7 @@
* v 1.0 03/22/99
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
diff --git a/net/802/fddi.c b/net/802/fddi.c
index 7d3a0af954e8..6356623fc238 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -141,15 +141,6 @@ __be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
EXPORT_SYMBOL(fddi_type_trans);
-int fddi_change_mtu(struct net_device *dev, int new_mtu)
-{
- if ((new_mtu < FDDI_K_SNAP_HLEN) || (new_mtu > FDDI_K_SNAP_DLEN))
- return -EINVAL;
- dev->mtu = new_mtu;
- return 0;
-}
-EXPORT_SYMBOL(fddi_change_mtu);
-
static const struct header_ops fddi_header_ops = {
.create = fddi_header,
};
@@ -161,6 +152,8 @@ static void fddi_setup(struct net_device *dev)
dev->type = ARPHRD_FDDI;
dev->hard_header_len = FDDI_K_SNAP_HLEN+3; /* Assume 802.2 SNAP hdr len + 3 pad bytes */
dev->mtu = FDDI_K_SNAP_DLEN; /* Assume max payload of 802.2 SNAP frame */
+ dev->min_mtu = FDDI_K_SNAP_HLEN;
+ dev->max_mtu = FDDI_K_SNAP_DLEN;
dev->addr_len = FDDI_K_ALEN;
dev->tx_queue_len = 100; /* Long queues on FDDI */
dev->flags = IFF_BROADCAST | IFF_MULTICAST;
diff --git a/net/802/hippi.c b/net/802/hippi.c
index ade1a52cdcff..4460606e9c36 100644
--- a/net/802/hippi.c
+++ b/net/802/hippi.c
@@ -34,7 +34,7 @@
#include <linux/errno.h>
#include <net/arp.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* Create the HIPPI MAC header for an arbitrary protocol layer
@@ -116,18 +116,6 @@ __be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
EXPORT_SYMBOL(hippi_type_trans);
-int hippi_change_mtu(struct net_device *dev, int new_mtu)
-{
- /*
- * HIPPI's got these nice large MTUs.
- */
- if ((new_mtu < 68) || (new_mtu > 65280))
- return -EINVAL;
- dev->mtu = new_mtu;
- return 0;
-}
-EXPORT_SYMBOL(hippi_change_mtu);
-
/*
* For HIPPI we will actually use the lower 4 bytes of the hardware
* address as the I-FIELD rather than the actual hardware address.
@@ -174,6 +162,8 @@ static void hippi_setup(struct net_device *dev)
dev->type = ARPHRD_HIPPI;
dev->hard_header_len = HIPPI_HLEN;
dev->mtu = 65280;
+ dev->min_mtu = 68;
+ dev->max_mtu = 65280;
dev->addr_len = HIPPI_ALEN;
dev->tx_queue_len = 25 /* 5 */;
memset(dev->broadcast, 0xFF, HIPPI_ALEN);
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index f2531ad66b68..467069b73ce1 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -34,7 +34,7 @@
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/if_vlan.h>
#include "vlan.h"
@@ -44,7 +44,7 @@
/* Global VLAN variables */
-int vlan_net_id __read_mostly;
+unsigned int vlan_net_id __read_mostly;
const char vlan_fullname[] = "802.1Q VLAN Support";
const char vlan_version[] = DRV_VERSION;
@@ -515,8 +515,8 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
return -EFAULT;
/* Null terminate this sucker, just in case. */
- args.device1[23] = 0;
- args.u.device2[23] = 0;
+ args.device1[sizeof(args.device1) - 1] = 0;
+ args.u.device2[sizeof(args.u.device2) - 1] = 0;
rtnl_lock();
@@ -571,8 +571,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
err = -EPERM;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
break;
- if ((args.u.name_type >= 0) &&
- (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) {
+ if (args.u.name_type < VLAN_NAME_TYPE_HIGHEST) {
struct vlan_net *vn;
vn = net_generic(net, vlan_net_id);
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index cc1557978066..df8bd65dd370 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -159,7 +159,7 @@ void vlan_netlink_fini(void);
extern struct rtnl_link_ops vlan_link_ops;
-extern int vlan_net_id;
+extern unsigned int vlan_net_id;
struct proc_dir_entry;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index fbfacd51aa34..e97ab824e368 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -671,7 +671,8 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev,
return 0;
}
-static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
+static void vlan_dev_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *stats)
{
struct vlan_pcpu_stats *p;
u32 rx_errors = 0, tx_dropped = 0;
@@ -702,8 +703,6 @@ static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, st
}
stats->rx_errors = rx_errors;
stats->tx_dropped = tx_dropped;
-
- return stats;
}
#ifdef CONFIG_NET_POLL_CONTROLLER
@@ -792,8 +791,6 @@ static const struct net_device_ops vlan_netdev_ops = {
.ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup,
#endif
.ndo_fix_features = vlan_dev_fix_features,
- .ndo_neigh_construct = netdev_default_l2upper_neigh_construct,
- .ndo_neigh_destroy = netdev_default_l2upper_neigh_destroy,
.ndo_fdb_add = switchdev_port_fdb_add,
.ndo_fdb_del = switchdev_port_fdb_del,
.ndo_fdb_dump = switchdev_port_fdb_dump,
@@ -826,5 +823,8 @@ void vlan_setup(struct net_device *dev)
dev->destructor = vlan_dev_free;
dev->ethtool_ops = &vlan_ethtool_ops;
+ dev->min_mtu = 0;
+ dev->max_mtu = ETH_MAX_MTU;
+
eth_zero_addr(dev->broadcast);
}
diff --git a/net/9p/client.c b/net/9p/client.c
index 3fc94a49ccd5..3ce672af1596 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -32,7 +32,7 @@
#include <linux/idr.h>
#include <linux/mutex.h>
#include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/uio.h>
#include <net/9p/9p.h>
@@ -1101,7 +1101,7 @@ void p9_client_begin_disconnect(struct p9_client *clnt)
EXPORT_SYMBOL(p9_client_begin_disconnect);
struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid,
- char *uname, kuid_t n_uname, char *aname)
+ const char *uname, kuid_t n_uname, const char *aname)
{
int err = 0;
struct p9_req_t *req;
@@ -1149,7 +1149,7 @@ error:
EXPORT_SYMBOL(p9_client_attach);
struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname,
- char **wnames, int clone)
+ const unsigned char * const *wnames, int clone)
{
int err;
struct p9_client *clnt;
@@ -1271,7 +1271,7 @@ error:
}
EXPORT_SYMBOL(p9_client_open);
-int p9_client_create_dotl(struct p9_fid *ofid, char *name, u32 flags, u32 mode,
+int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, u32 mode,
kgid_t gid, struct p9_qid *qid)
{
int err = 0;
@@ -1316,7 +1316,7 @@ error:
}
EXPORT_SYMBOL(p9_client_create_dotl);
-int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode,
+int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode,
char *extension)
{
int err;
@@ -1361,8 +1361,8 @@ error:
}
EXPORT_SYMBOL(p9_client_fcreate);
-int p9_client_symlink(struct p9_fid *dfid, char *name, char *symtgt, kgid_t gid,
- struct p9_qid *qid)
+int p9_client_symlink(struct p9_fid *dfid, const char *name,
+ const char *symtgt, kgid_t gid, struct p9_qid *qid)
{
int err = 0;
struct p9_client *clnt;
@@ -1395,7 +1395,7 @@ error:
}
EXPORT_SYMBOL(p9_client_symlink);
-int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, char *newname)
+int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, const char *newname)
{
struct p9_client *clnt;
struct p9_req_t *req;
@@ -2117,7 +2117,7 @@ error:
}
EXPORT_SYMBOL(p9_client_readdir);
-int p9_client_mknod_dotl(struct p9_fid *fid, char *name, int mode,
+int p9_client_mknod_dotl(struct p9_fid *fid, const char *name, int mode,
dev_t rdev, kgid_t gid, struct p9_qid *qid)
{
int err;
@@ -2148,7 +2148,7 @@ error:
}
EXPORT_SYMBOL(p9_client_mknod_dotl);
-int p9_client_mkdir_dotl(struct p9_fid *fid, char *name, int mode,
+int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode,
kgid_t gid, struct p9_qid *qid)
{
int err;
diff --git a/net/Kconfig b/net/Kconfig
index 7b6cd340b72b..102f781a0131 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -57,6 +57,7 @@ source "net/packet/Kconfig"
source "net/unix/Kconfig"
source "net/xfrm/Kconfig"
source "net/iucv/Kconfig"
+source "net/smc/Kconfig"
config INET
bool "TCP/IP networking"
@@ -258,10 +259,6 @@ config XPS
config HWBM
bool
-config SOCK_CGROUP_DATA
- bool
- default n
-
config CGROUP_NET_PRIO
bool "Network priority cgroup"
depends on CGROUPS
@@ -300,7 +297,8 @@ config BPF_JIT
Note, admin should enable this feature changing:
/proc/sys/net/core/bpf_jit_enable
- /proc/sys/net/core/bpf_jit_harden (optional)
+ /proc/sys/net/core/bpf_jit_harden (optional)
+ /proc/sys/net/core/bpf_jit_kallsyms (optional)
config NET_FLOW_LIMIT
bool
@@ -393,6 +391,8 @@ source "net/9p/Kconfig"
source "net/caif/Kconfig"
source "net/ceph/Kconfig"
source "net/nfc/Kconfig"
+source "net/psample/Kconfig"
+source "net/ife/Kconfig"
config LWTUNNEL
bool "Network light weight tunnels"
@@ -402,10 +402,22 @@ config LWTUNNEL
weight tunnel endpoint. Tunnel encapsulation parameters are stored
with light weight tunnel state associated with fib routes.
+config LWTUNNEL_BPF
+ bool "Execute BPF program as route nexthop action"
+ depends on LWTUNNEL
+ default y if LWTUNNEL=y
+ ---help---
+ Allows to run BPF programs as a nexthop action following a route
+ lookup for incoming and outgoing packets.
+
config DST_CACHE
bool
default n
+config GRO_CELLS
+ bool
+ default n
+
config NET_DEVLINK
tristate "Network physical/parent device Netlink interface"
help
diff --git a/net/Makefile b/net/Makefile
index 4cafaa2b4667..9b681550e3a3 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_MAC80211) += mac80211/
obj-$(CONFIG_TIPC) += tipc/
obj-$(CONFIG_NETLABEL) += netlabel/
obj-$(CONFIG_IUCV) += iucv/
+obj-$(CONFIG_SMC) += smc/
obj-$(CONFIG_RFKILL) += rfkill/
obj-$(CONFIG_NET_9P) += 9p/
obj-$(CONFIG_CAIF) += caif/
@@ -69,6 +70,8 @@ obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/
obj-$(CONFIG_CEPH_LIB) += ceph/
obj-$(CONFIG_BATMAN_ADV) += batman-adv/
obj-$(CONFIG_NFC) += nfc/
+obj-$(CONFIG_PSAMPLE) += psample/
+obj-$(CONFIG_NET_IFE) += ife/
obj-$(CONFIG_OPENVSWITCH) += openvswitch/
obj-$(CONFIG_VSOCKETS) += vmw_vsock/
obj-$(CONFIG_MPLS) += mpls/
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 10d2bdce686e..465cc24b41e5 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1656,7 +1656,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
ddp->deh_dport = usat->sat_port;
ddp->deh_sport = at->src_port;
- SOCK_DEBUG(sk, "SK %p: Copy user data (%Zd bytes).\n", sk, len);
+ SOCK_DEBUG(sk, "SK %p: Copy user data (%zd bytes).\n", sk, len);
err = memcpy_from_msg(skb_put(skb, len), msg, len);
if (err) {
@@ -1720,7 +1720,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
*/
aarp_send_ddp(dev, skb, &usat->sat_addr, NULL);
}
- SOCK_DEBUG(sk, "SK %p: Done write (%Zd).\n", sk, len);
+ SOCK_DEBUG(sk, "SK %p: Done write (%zd).\n", sk, len);
out:
release_sock(sk);
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index aa0047c5c467..fca84e111c89 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -620,14 +620,12 @@ error:
static const struct net_device_ops br2684_netdev_ops = {
.ndo_start_xmit = br2684_start_xmit,
.ndo_set_mac_address = br2684_mac_addr,
- .ndo_change_mtu = eth_change_mtu,
.ndo_validate_addr = eth_validate_addr,
};
static const struct net_device_ops br2684_netdev_ops_routed = {
.ndo_start_xmit = br2684_start_xmit,
.ndo_set_mac_address = br2684_mac_addr,
- .ndo_change_mtu = eth_change_mtu
};
static void br2684_setup(struct net_device *netdev)
@@ -651,7 +649,9 @@ static void br2684_setup_routed(struct net_device *netdev)
netdev->hard_header_len = sizeof(llc_oui_ipv4); /* worst case */
netdev->netdev_ops = &br2684_netdev_ops_routed;
netdev->addr_len = 0;
- netdev->mtu = 1500;
+ netdev->mtu = ETH_DATA_LEN;
+ netdev->min_mtu = 0;
+ netdev->max_mtu = ETH_MAX_MTU;
netdev->type = ARPHRD_PPP;
netdev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
netdev->tx_queue_len = 100;
diff --git a/net/atm/common.c b/net/atm/common.c
index 6dc12305799e..9613381f5db0 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -13,7 +13,7 @@
#include <linux/errno.h> /* error codes */
#include <linux/capability.h>
#include <linux/mm.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/time.h> /* struct timeval */
#include <linux/skbuff.h>
#include <linux/bitops.h>
@@ -630,7 +630,7 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size)
goto out;
skb->dev = NULL; /* for paths shared with net_device interfaces */
ATM_SKB(skb)->atm_options = vcc->atm_options;
- if (copy_from_iter(skb_put(skb, size), size, &m->msg_iter) != size) {
+ if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) {
kfree_skb(skb);
error = -EFAULT;
goto out;
diff --git a/net/atm/lec.c b/net/atm/lec.c
index 5d2693826afb..09cfe87f0a44 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -111,9 +111,9 @@ static inline void lec_arp_put(struct lec_arp_table *entry)
}
static struct lane2_ops lane2_ops = {
- lane2_resolve, /* resolve, spec 3.1.3 */
- lane2_associate_req, /* associate_req, spec 3.1.4 */
- NULL /* associate indicator, spec 3.1.5 */
+ .resolve = lane2_resolve, /* spec 3.1.3 */
+ .associate_req = lane2_associate_req, /* spec 3.1.4 */
+ .associate_indicator = NULL /* spec 3.1.5 */
};
static unsigned char bus_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
@@ -544,15 +544,6 @@ send_to_lecd(struct lec_priv *priv, atmlec_msg_type type,
return 0;
}
-/* shamelessly stolen from drivers/net/net_init.c */
-static int lec_change_mtu(struct net_device *dev, int new_mtu)
-{
- if ((new_mtu < 68) || (new_mtu > 18190))
- return -EINVAL;
- dev->mtu = new_mtu;
- return 0;
-}
-
static void lec_set_multicast_list(struct net_device *dev)
{
/*
@@ -565,7 +556,6 @@ static const struct net_device_ops lec_netdev_ops = {
.ndo_open = lec_open,
.ndo_stop = lec_close,
.ndo_start_xmit = lec_start_xmit,
- .ndo_change_mtu = lec_change_mtu,
.ndo_tx_timeout = lec_tx_timeout,
.ndo_set_rx_mode = lec_set_multicast_list,
};
@@ -742,6 +732,7 @@ static int lecd_attach(struct atm_vcc *vcc, int arg)
if (!dev_lec[i])
return -ENOMEM;
dev_lec[i]->netdev_ops = &lec_netdev_ops;
+ dev_lec[i]->max_mtu = 18190;
snprintf(dev_lec[i]->name, IFNAMSIZ, "lec%d", i);
if (register_netdev(dev_lec[i])) {
free_netdev(dev_lec[i]);
@@ -1068,7 +1059,9 @@ static void __exit lane_module_cleanup(void)
{
int i;
+#ifdef CONFIG_PROC_FS
remove_proc_entry("lec", atm_proc_root);
+#endif
deregister_atm_ioctl(&lane_ioctl_ops);
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 3b3b1a292ec8..a190800572bd 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -451,7 +451,7 @@ static void lane2_assoc_ind(struct net_device *dev, const u8 *mac_addr,
return;
}
if (end_of_tlvs - tlvs != 0)
- pr_info("(%s) ignoring %Zd bytes of trailing TLV garbage\n",
+ pr_info("(%s) ignoring %zd bytes of trailing TLV garbage\n",
dev->name, end_of_tlvs - tlvs);
}
diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c
index 9e60e74c807d..a89fdebeffda 100644
--- a/net/atm/mpoa_caches.c
+++ b/net/atm/mpoa_caches.c
@@ -535,33 +535,32 @@ static void eg_destroy_cache(struct mpoa_client *mpc)
static const struct in_cache_ops ingress_ops = {
- in_cache_add_entry, /* add_entry */
- in_cache_get, /* get */
- in_cache_get_with_mask, /* get_with_mask */
- in_cache_get_by_vcc, /* get_by_vcc */
- in_cache_put, /* put */
- in_cache_remove_entry, /* remove_entry */
- cache_hit, /* cache_hit */
- clear_count_and_expired, /* clear_count */
- check_resolving_entries, /* check_resolving */
- refresh_entries, /* refresh */
- in_destroy_cache /* destroy_cache */
+ .add_entry = in_cache_add_entry,
+ .get = in_cache_get,
+ .get_with_mask = in_cache_get_with_mask,
+ .get_by_vcc = in_cache_get_by_vcc,
+ .put = in_cache_put,
+ .remove_entry = in_cache_remove_entry,
+ .cache_hit = cache_hit,
+ .clear_count = clear_count_and_expired,
+ .check_resolving = check_resolving_entries,
+ .refresh = refresh_entries,
+ .destroy_cache = in_destroy_cache
};
static const struct eg_cache_ops egress_ops = {
- eg_cache_add_entry, /* add_entry */
- eg_cache_get_by_cache_id, /* get_by_cache_id */
- eg_cache_get_by_tag, /* get_by_tag */
- eg_cache_get_by_vcc, /* get_by_vcc */
- eg_cache_get_by_src_ip, /* get_by_src_ip */
- eg_cache_put, /* put */
- eg_cache_remove_entry, /* remove_entry */
- update_eg_cache_entry, /* update */
- clear_expired, /* clear_expired */
- eg_destroy_cache /* destroy_cache */
+ .add_entry = eg_cache_add_entry,
+ .get_by_cache_id = eg_cache_get_by_cache_id,
+ .get_by_tag = eg_cache_get_by_tag,
+ .get_by_vcc = eg_cache_get_by_vcc,
+ .get_by_src_ip = eg_cache_get_by_src_ip,
+ .put = eg_cache_put,
+ .remove_entry = eg_cache_remove_entry,
+ .update = update_eg_cache_entry,
+ .clear_expired = clear_expired,
+ .destroy_cache = eg_destroy_cache
};
-
void atm_mpoa_init_cache(struct mpoa_client *mpc)
{
mpc->in_ops = &ingress_ops;
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 878563a8354d..5589de7086af 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -10,7 +10,7 @@
#include <linux/kernel.h> /* printk */
#include <linux/skbuff.h>
#include <linux/wait.h>
-#include <linux/sched.h> /* jiffies and HZ */
+#include <linux/sched/signal.h>
#include <linux/fcntl.h> /* O_NONBLOCK */
#include <linux/init.h>
#include <linux/atm.h> /* ATM stuff */
@@ -318,7 +318,8 @@ out:
return error;
}
-static int svc_accept(struct socket *sock, struct socket *newsock, int flags)
+static int svc_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern)
{
struct sock *sk = sock->sk;
struct sk_buff *skb;
@@ -329,7 +330,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags)
lock_sock(sk);
- error = svc_create(sock_net(sk), newsock, 0, 0);
+ error = svc_create(sock_net(sk), newsock, 0, kern);
if (error)
goto out;
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 2fdebabbfacd..b7c486752b3a 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -20,7 +20,7 @@
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
@@ -32,7 +32,7 @@
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/termios.h> /* For TIOCINQ/OUTQ */
#include <linux/mm.h>
@@ -1320,7 +1320,8 @@ out_release:
return err;
}
-static int ax25_accept(struct socket *sock, struct socket *newsock, int flags)
+static int ax25_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern)
{
struct sk_buff *skb;
struct sock *newsk;
diff --git a/net/ax25/ax25_addr.c b/net/ax25/ax25_addr.c
index e7c9b0ea17a1..ac2542b7be88 100644
--- a/net/ax25/ax25_addr.c
+++ b/net/ax25/ax25_addr.c
@@ -21,7 +21,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c
index 3d106767b272..9a3a301e1e2f 100644
--- a/net/ax25/ax25_dev.c
+++ b/net/ax25/ax25_dev.c
@@ -23,7 +23,7 @@
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c
index 9bd31e88aeca..891596e74278 100644
--- a/net/ax25/ax25_ds_in.c
+++ b/net/ax25/ax25_ds_in.c
@@ -22,7 +22,7 @@
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c
index e05bd57b5afd..28827e81ba2b 100644
--- a/net/ax25/ax25_ds_subr.c
+++ b/net/ax25/ax25_ds_subr.c
@@ -23,7 +23,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c
index 5237dff6941d..5fb2104b7304 100644
--- a/net/ax25/ax25_ds_timer.c
+++ b/net/ax25/ax25_ds_timer.c
@@ -24,7 +24,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_iface.c b/net/ax25/ax25_iface.c
index 7f16e8a931b2..8c07c28569e4 100644
--- a/net/ax25/ax25_iface.c
+++ b/net/ax25/ax25_iface.c
@@ -23,7 +23,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c
index bb5a0e4e98d9..860752639b1a 100644
--- a/net/ax25/ax25_in.c
+++ b/net/ax25/ax25_in.c
@@ -25,7 +25,7 @@
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c
index 2fa3be965101..183b1c583d56 100644
--- a/net/ax25/ax25_ip.c
+++ b/net/ax25/ax25_ip.c
@@ -23,7 +23,7 @@
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/termios.h> /* For TIOCINQ/OUTQ */
#include <linux/mm.h>
diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c
index 8ddd41baa81c..b11a5f466fcc 100644
--- a/net/ax25/ax25_out.c
+++ b/net/ax25/ax25_out.c
@@ -25,7 +25,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index d39097737e38..e1fda27cb27c 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -31,7 +31,7 @@
#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_std_in.c b/net/ax25/ax25_std_in.c
index 3fbf8f7b2cf4..8632b86e843e 100644
--- a/net/ax25/ax25_std_in.c
+++ b/net/ax25/ax25_std_in.c
@@ -29,7 +29,7 @@
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_std_subr.c b/net/ax25/ax25_std_subr.c
index 8b66a41e538f..94bd06396a43 100644
--- a/net/ax25/ax25_std_subr.c
+++ b/net/ax25/ax25_std_subr.c
@@ -20,7 +20,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c
index 2c0d6ef66f9d..30bbc675261d 100644
--- a/net/ax25/ax25_std_timer.c
+++ b/net/ax25/ax25_std_timer.c
@@ -24,7 +24,7 @@
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c
index 655a7d4c96e1..038b109b2be7 100644
--- a/net/ax25/ax25_subr.c
+++ b/net/ax25/ax25_subr.c
@@ -25,7 +25,7 @@
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
@@ -264,7 +264,7 @@ void ax25_disconnect(ax25_cb *ax25, int reason)
{
ax25_clear_queues(ax25);
- if (!sock_flag(ax25->sk, SOCK_DESTROY))
+ if (!ax25->sk || !sock_flag(ax25->sk, SOCK_DESTROY))
ax25_stop_heartbeat(ax25);
ax25_stop_t1timer(ax25);
ax25_stop_t2timer(ax25);
diff --git a/net/ax25/ax25_timer.c b/net/ax25/ax25_timer.c
index c3cffa79bafb..23a6f38a80bf 100644
--- a/net/ax25/ax25_timer.c
+++ b/net/ax25/ax25_timer.c
@@ -28,7 +28,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c
index 4ad2fb7bcd35..0403b0def7e6 100644
--- a/net/ax25/ax25_uid.c
+++ b/net/ax25/ax25_uid.c
@@ -25,7 +25,7 @@
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index f20742cbae6d..b73b96a2854b 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -17,7 +17,7 @@ config BATMAN_ADV
config BATMAN_ADV_BATMAN_V
bool "B.A.T.M.A.N. V protocol (experimental)"
- depends on BATMAN_ADV && CFG80211=y || (CFG80211=m && BATMAN_ADV=m)
+ depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y)
default n
help
This option enables the B.A.T.M.A.N. V protocol, the successor
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index f724d3c98a81..915987bc6d29 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+# Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
#
# Marek Lindner, Simon Wunderlich
#
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
index 623d04302aa2..44fd073b7546 100644
--- a/net/batman-adv/bat_algo.c
+++ b/net/batman-adv/bat_algo.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 3b5b69cdd12b..29f6312f9bf1 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Linus Lüssing
*
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index e2d18d0b1f06..71343d0fec94 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -698,7 +698,7 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
forw_packet_aggr->skb = netdev_alloc_skb_ip_align(NULL, skb_size);
if (!forw_packet_aggr->skb) {
- batadv_forw_packet_free(forw_packet_aggr);
+ batadv_forw_packet_free(forw_packet_aggr, true);
return;
}
@@ -717,17 +717,10 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
if (direct_link)
forw_packet_aggr->direct_link_flags |= 1;
- /* add new packet to packet list */
- spin_lock_bh(&bat_priv->forw_bat_list_lock);
- hlist_add_head(&forw_packet_aggr->list, &bat_priv->forw_bat_list);
- spin_unlock_bh(&bat_priv->forw_bat_list_lock);
-
- /* start timer for this packet */
INIT_DELAYED_WORK(&forw_packet_aggr->delayed_work,
batadv_iv_send_outstanding_bat_ogm_packet);
- queue_delayed_work(batadv_event_workqueue,
- &forw_packet_aggr->delayed_work,
- send_time - jiffies);
+
+ batadv_forw_packet_ogmv1_queue(bat_priv, forw_packet_aggr, send_time);
}
/* aggregate a new packet into the existing ogm packet */
@@ -1272,7 +1265,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
*/
tq_iface_penalty = BATADV_TQ_MAX_VALUE;
if (if_outgoing && (if_incoming == if_outgoing) &&
- batadv_is_wifi_netdev(if_outgoing->net_dev))
+ batadv_is_wifi_hardif(if_outgoing))
tq_iface_penalty = batadv_hop_penalty(BATADV_TQ_MAX_VALUE,
bat_priv);
@@ -1611,7 +1604,7 @@ out:
if (hardif_neigh)
batadv_hardif_neigh_put(hardif_neigh);
- kfree_skb(skb_priv);
+ consume_skb(skb_priv);
}
/**
@@ -1783,17 +1776,17 @@ static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work)
struct delayed_work *delayed_work;
struct batadv_forw_packet *forw_packet;
struct batadv_priv *bat_priv;
+ bool dropped = false;
delayed_work = to_delayed_work(work);
forw_packet = container_of(delayed_work, struct batadv_forw_packet,
delayed_work);
bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface);
- spin_lock_bh(&bat_priv->forw_bat_list_lock);
- hlist_del(&forw_packet->list);
- spin_unlock_bh(&bat_priv->forw_bat_list_lock);
- if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) {
+ dropped = true;
goto out;
+ }
batadv_iv_ogm_emit(forw_packet);
@@ -1810,7 +1803,10 @@ static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work)
batadv_iv_ogm_schedule(forw_packet->if_incoming);
out:
- batadv_forw_packet_free(forw_packet);
+ /* do we get something for free()? */
+ if (batadv_forw_packet_steal(forw_packet,
+ &bat_priv->forw_bat_list_lock))
+ batadv_forw_packet_free(forw_packet, dropped);
}
static int batadv_iv_ogm_receive(struct sk_buff *skb,
@@ -1820,17 +1816,18 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
struct batadv_ogm_packet *ogm_packet;
u8 *packet_pos;
int ogm_offset;
- bool ret;
+ bool res;
+ int ret = NET_RX_DROP;
- ret = batadv_check_management_packet(skb, if_incoming, BATADV_OGM_HLEN);
- if (!ret)
- return NET_RX_DROP;
+ res = batadv_check_management_packet(skb, if_incoming, BATADV_OGM_HLEN);
+ if (!res)
+ goto free_skb;
/* did we receive a B.A.T.M.A.N. IV OGM packet on an interface
* that does not have B.A.T.M.A.N. IV enabled ?
*/
if (bat_priv->algo_ops->iface.enable != batadv_iv_ogm_iface_enable)
- return NET_RX_DROP;
+ goto free_skb;
batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX);
batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES,
@@ -1851,8 +1848,15 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
ogm_packet = (struct batadv_ogm_packet *)packet_pos;
}
- kfree_skb(skb);
- return NET_RX_SUCCESS;
+ ret = NET_RX_SUCCESS;
+
+free_skb:
+ if (ret == NET_RX_SUCCESS)
+ consume_skb(skb);
+ else
+ kfree_skb(skb);
+
+ return ret;
}
#ifdef CONFIG_BATMAN_ADV_DEBUGFS
@@ -2473,6 +2477,16 @@ static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface)
batadv_iv_ogm_schedule(hard_iface);
}
+/**
+ * batadv_iv_init_sel_class - initialize GW selection class
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_iv_init_sel_class(struct batadv_priv *bat_priv)
+{
+ /* set default TQ difference threshold to 20 */
+ atomic_set(&bat_priv->gw.sel_class, 20);
+}
+
static struct batadv_gw_node *
batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
{
@@ -2486,7 +2500,7 @@ batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
struct batadv_orig_node *orig_node;
rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
+ hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
orig_node = gw_node->orig_node;
router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
if (!router)
@@ -2674,7 +2688,7 @@ static void batadv_iv_gw_print(struct batadv_priv *bat_priv,
" Gateway (#/255) Nexthop [outgoingIF]: advertised uplink bandwidth\n");
rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
+ hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
/* fails if orig_node has no router */
if (batadv_iv_gw_write_buffer_text(bat_priv, seq, gw_node) < 0)
continue;
@@ -2774,7 +2788,7 @@ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
int idx = 0;
rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
+ hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
if (idx++ < idx_skip)
continue;
@@ -2819,6 +2833,7 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
.del_if = batadv_iv_ogm_orig_del_if,
},
.gw = {
+ .init_sel_class = batadv_iv_init_sel_class,
.get_best_gw_node = batadv_iv_gw_get_best_gw_node,
.is_eligible = batadv_iv_gw_is_eligible,
#ifdef CONFIG_BATMAN_ADV_DEBUGFS
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
index b9f3550faaf7..ae2ab526bdb1 100644
--- a/net/batman-adv/bat_iv_ogm.h
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index e79f6f01182e..a36c8e7291d6 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
*
* Linus Lüssing, Marek Lindner
*
@@ -668,6 +668,16 @@ err_ifinfo1:
return ret;
}
+/**
+ * batadv_v_init_sel_class - initialize GW selection class
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_v_init_sel_class(struct batadv_priv *bat_priv)
+{
+ /* set default throughput difference threshold to 5Mbps */
+ atomic_set(&bat_priv->gw.sel_class, 50);
+}
+
static ssize_t batadv_v_store_sel_class(struct batadv_priv *bat_priv,
char *buff, size_t count)
{
@@ -750,7 +760,7 @@ batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv)
u32 max_bw = 0, bw;
rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
+ hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
if (!kref_get_unless_zero(&gw_node->refcount))
continue;
@@ -787,7 +797,7 @@ static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv,
struct batadv_orig_node *curr_gw_orig,
struct batadv_orig_node *orig_node)
{
- struct batadv_gw_node *curr_gw = NULL, *orig_gw = NULL;
+ struct batadv_gw_node *curr_gw, *orig_gw = NULL;
u32 gw_throughput, orig_throughput, threshold;
bool ret = false;
@@ -889,7 +899,7 @@ static void batadv_v_gw_print(struct batadv_priv *bat_priv,
" Gateway ( throughput) Nexthop [outgoingIF]: advertised uplink bandwidth\n");
rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
+ hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
/* fails if orig_node has no router */
if (batadv_v_gw_write_buffer_text(bat_priv, seq, gw_node) < 0)
continue;
@@ -1009,7 +1019,7 @@ static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
int idx = 0;
rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
+ hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
if (idx++ < idx_skip)
continue;
@@ -1052,6 +1062,7 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = {
.dump = batadv_v_orig_dump,
},
.gw = {
+ .init_sel_class = batadv_v_init_sel_class,
.store_sel_class = batadv_v_store_sel_class,
.show_sel_class = batadv_v_show_sel_class,
.get_best_gw_node = batadv_v_gw_get_best_gw_node,
@@ -1092,9 +1103,6 @@ int batadv_v_mesh_init(struct batadv_priv *bat_priv)
if (ret < 0)
return ret;
- /* set default throughput difference threshold to 5Mbps */
- atomic_set(&bat_priv->gw.sel_class, 50);
-
return 0;
}
diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h
index 83b77639729e..dd7c4b647e6b 100644
--- a/net/batman-adv/bat_v.h
+++ b/net/batman-adv/bat_v.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Linus Lüssing
*
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index ee08540ce503..b90c9903e246 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
*
* Linus Lüssing, Marek Lindner
*
@@ -75,6 +75,7 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
{
struct batadv_hard_iface *hard_iface = neigh->if_incoming;
struct ethtool_link_ksettings link_settings;
+ struct net_device *real_netdev;
struct station_info sinfo;
u32 throughput;
int ret;
@@ -89,23 +90,27 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
/* if this is a wireless device, then ask its throughput through
* cfg80211 API
*/
- if (batadv_is_wifi_netdev(hard_iface->net_dev)) {
- if (hard_iface->net_dev->ieee80211_ptr) {
- ret = cfg80211_get_station(hard_iface->net_dev,
- neigh->addr, &sinfo);
- if (ret == -ENOENT) {
- /* Node is not associated anymore! It would be
- * possible to delete this neighbor. For now set
- * the throughput metric to 0.
- */
- return 0;
- }
- if (!ret)
- return sinfo.expected_throughput / 100;
+ if (batadv_is_wifi_hardif(hard_iface)) {
+ if (!batadv_is_cfg80211_hardif(hard_iface))
+ /* unsupported WiFi driver version */
+ goto default_throughput;
+
+ real_netdev = batadv_get_real_netdev(hard_iface->net_dev);
+ if (!real_netdev)
+ goto default_throughput;
+
+ ret = cfg80211_get_station(real_netdev, neigh->addr, &sinfo);
+
+ dev_put(real_netdev);
+ if (ret == -ENOENT) {
+ /* Node is not associated anymore! It would be
+ * possible to delete this neighbor. For now set
+ * the throughput metric to 0.
+ */
+ return 0;
}
-
- /* unsupported WiFi driver version */
- goto default_throughput;
+ if (!ret)
+ return sinfo.expected_throughput / 100;
}
/* if not a wifi interface, check if this device provides data via
@@ -187,7 +192,7 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh)
int elp_skb_len;
/* this probing routine is for Wifi neighbours only */
- if (!batadv_is_wifi_netdev(hard_iface->net_dev))
+ if (!batadv_is_wifi_hardif(hard_iface))
return true;
/* probe the neighbor only if no unicast packets have been sent
@@ -352,7 +357,7 @@ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface)
/* warn the user (again) if there is no throughput data is available */
hard_iface->bat_v.flags &= ~BATADV_WARNING_DEFAULT;
- if (batadv_is_wifi_netdev(hard_iface->net_dev))
+ if (batadv_is_wifi_hardif(hard_iface))
hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX;
INIT_DELAYED_WORK(&hard_iface->bat_v.elp_wq,
@@ -492,20 +497,21 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb,
struct batadv_elp_packet *elp_packet;
struct batadv_hard_iface *primary_if;
struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb);
- bool ret;
+ bool res;
+ int ret = NET_RX_DROP;
- ret = batadv_check_management_packet(skb, if_incoming, BATADV_ELP_HLEN);
- if (!ret)
- return NET_RX_DROP;
+ res = batadv_check_management_packet(skb, if_incoming, BATADV_ELP_HLEN);
+ if (!res)
+ goto free_skb;
if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
- return NET_RX_DROP;
+ goto free_skb;
/* did we receive a B.A.T.M.A.N. V ELP packet on an interface
* that does not have B.A.T.M.A.N. V ELP enabled ?
*/
if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0)
- return NET_RX_DROP;
+ goto free_skb;
elp_packet = (struct batadv_elp_packet *)skb->data;
@@ -516,14 +522,19 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb,
primary_if = batadv_primary_if_get_selected(bat_priv);
if (!primary_if)
- goto out;
+ goto free_skb;
batadv_v_elp_neigh_update(bat_priv, ethhdr->h_source, if_incoming,
elp_packet);
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- consume_skb(skb);
- return NET_RX_SUCCESS;
+ ret = NET_RX_SUCCESS;
+ batadv_hardif_put(primary_if);
+
+free_skb:
+ if (ret == NET_RX_SUCCESS)
+ consume_skb(skb);
+ else
+ kfree_skb(skb);
+
+ return ret;
}
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index be17c0b1369e..376ead280ab9 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
*
* Linus Lüssing, Marek Lindner
*
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 1aeeadca620c..03a35c9f456d 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
*
* Antonio Quartulli
*
@@ -140,6 +140,7 @@ static void batadv_v_ogm_send(struct work_struct *work)
unsigned char *ogm_buff, *pkt_buff;
int ogm_buff_len;
u16 tvlv_len = 0;
+ int ret;
bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work);
bat_priv = container_of(bat_v, struct batadv_priv, bat_v);
@@ -182,6 +183,31 @@ static void batadv_v_ogm_send(struct work_struct *work)
if (!kref_get_unless_zero(&hard_iface->refcount))
continue;
+ ret = batadv_hardif_no_broadcast(hard_iface, NULL, NULL);
+ if (ret) {
+ char *type;
+
+ switch (ret) {
+ case BATADV_HARDIF_BCAST_NORECIPIENT:
+ type = "no neighbor";
+ break;
+ case BATADV_HARDIF_BCAST_DUPFWD:
+ type = "single neighbor is source";
+ break;
+ case BATADV_HARDIF_BCAST_DUPORIG:
+ type = "single neighbor is originator";
+ break;
+ default:
+ type = "unknown";
+ }
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "OGM2 from ourselve on %s surpressed: %s\n",
+ hard_iface->net_dev->name, type);
+
+ batadv_hardif_put(hard_iface);
+ continue;
+ }
+
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"Sending own OGM2 packet (originator %pM, seqno %u, throughput %u, TTL %d) on interface %s [%pM]\n",
ogm_packet->orig, ntohl(ogm_packet->seqno),
@@ -401,7 +427,7 @@ static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv,
struct batadv_hard_iface *if_incoming,
struct batadv_hard_iface *if_outgoing)
{
- struct batadv_orig_ifinfo *orig_ifinfo = NULL;
+ struct batadv_orig_ifinfo *orig_ifinfo;
struct batadv_neigh_ifinfo *neigh_ifinfo = NULL;
bool protection_started = false;
int ret = -EINVAL;
@@ -486,7 +512,7 @@ static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv,
struct batadv_hard_iface *if_outgoing)
{
struct batadv_neigh_node *router = NULL;
- struct batadv_orig_node *orig_neigh_node = NULL;
+ struct batadv_orig_node *orig_neigh_node;
struct batadv_neigh_node *orig_neigh_router = NULL;
struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL;
u32 router_throughput, neigh_throughput;
@@ -651,6 +677,7 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
struct batadv_hard_iface *hard_iface;
struct batadv_ogm2_packet *ogm_packet;
u32 ogm_throughput, link_throughput, path_throughput;
+ int ret;
ethhdr = eth_hdr(skb);
ogm_packet = (struct batadv_ogm2_packet *)(skb->data + ogm_offset);
@@ -716,6 +743,35 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
if (!kref_get_unless_zero(&hard_iface->refcount))
continue;
+ ret = batadv_hardif_no_broadcast(hard_iface,
+ ogm_packet->orig,
+ hardif_neigh->orig);
+
+ if (ret) {
+ char *type;
+
+ switch (ret) {
+ case BATADV_HARDIF_BCAST_NORECIPIENT:
+ type = "no neighbor";
+ break;
+ case BATADV_HARDIF_BCAST_DUPFWD:
+ type = "single neighbor is source";
+ break;
+ case BATADV_HARDIF_BCAST_DUPORIG:
+ type = "single neighbor is originator";
+ break;
+ default:
+ type = "unknown";
+ }
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "OGM2 packet from %pM on %s surpressed: %s\n",
+ ogm_packet->orig, hard_iface->net_dev->name,
+ type);
+
+ batadv_hardif_put(hard_iface);
+ continue;
+ }
+
batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet,
orig_node, neigh_node,
if_incoming, hard_iface);
@@ -754,18 +810,18 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb,
* B.A.T.M.A.N. V enabled ?
*/
if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0)
- return NET_RX_DROP;
+ goto free_skb;
if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN))
- return NET_RX_DROP;
+ goto free_skb;
if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
- return NET_RX_DROP;
+ goto free_skb;
ogm_packet = (struct batadv_ogm2_packet *)skb->data;
if (batadv_is_my_mac(bat_priv, ogm_packet->orig))
- return NET_RX_DROP;
+ goto free_skb;
batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX);
batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES,
@@ -786,7 +842,12 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb,
}
ret = NET_RX_SUCCESS;
- consume_skb(skb);
+
+free_skb:
+ if (ret == NET_RX_SUCCESS)
+ consume_skb(skb);
+ else
+ kfree_skb(skb);
return ret;
}
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index 4c4d45caa422..2068770b542d 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
*
* Antonio Quartulli
*
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index 032271421a20..2b070c7e31da 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index 0e6e9d09078c..cc262c9d97e0 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index e7f690b571ea..ba8420d8a992 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich
*
@@ -449,7 +449,6 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac,
batadv_inc_counter(bat_priv, BATADV_CNT_RX);
batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
skb->len + ETH_HLEN);
- soft_iface->last_rx = jiffies;
netif_rx(skb);
out:
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 1ae93e46fb98..e157986bd01c 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich
*
@@ -20,6 +20,8 @@
#include "main.h"
+#include <linux/compiler.h>
+#include <linux/stddef.h>
#include <linux/types.h>
struct net_device;
@@ -27,6 +29,22 @@ struct netlink_callback;
struct seq_file;
struct sk_buff;
+/**
+ * batadv_bla_is_loopdetect_mac - check if the mac address is from a loop detect
+ * frame sent by bridge loop avoidance
+ * @mac: mac address to check
+ *
+ * Return: true if the it looks like a loop detect frame
+ * (mac starts with BA:BE), false otherwise
+ */
+static inline bool batadv_bla_is_loopdetect_mac(const uint8_t *mac)
+{
+ if (mac[0] == 0xba && mac[1] == 0xbe)
+ return true;
+
+ return false;
+}
+
#ifdef CONFIG_BATMAN_ADV_BLA
bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
unsigned short vid, bool is_bcast);
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index b4ffba7dd583..e32ad47c6efd 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -19,7 +19,7 @@
#include "main.h"
#include <linux/debugfs.h>
-#include <linux/device.h>
+#include <linux/err.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/fs.h>
@@ -186,7 +186,7 @@ struct batadv_debuginfo batadv_debuginfo_##_name = { \
/* the following attributes are general and therefore they will be directly
* placed in the BATADV_DEBUGFS_SUBDIR subdirectory of debugfs
*/
-static BATADV_DEBUGINFO(routing_algos, S_IRUGO, batadv_algorithms_open);
+static BATADV_DEBUGINFO(routing_algos, 0444, batadv_algorithms_open);
static struct batadv_debuginfo *batadv_general_debuginfos[] = {
&batadv_debuginfo_routing_algos,
@@ -194,26 +194,24 @@ static struct batadv_debuginfo *batadv_general_debuginfos[] = {
};
/* The following attributes are per soft interface */
-static BATADV_DEBUGINFO(neighbors, S_IRUGO, neighbors_open);
-static BATADV_DEBUGINFO(originators, S_IRUGO, batadv_originators_open);
-static BATADV_DEBUGINFO(gateways, S_IRUGO, batadv_gateways_open);
-static BATADV_DEBUGINFO(transtable_global, S_IRUGO,
- batadv_transtable_global_open);
+static BATADV_DEBUGINFO(neighbors, 0444, neighbors_open);
+static BATADV_DEBUGINFO(originators, 0444, batadv_originators_open);
+static BATADV_DEBUGINFO(gateways, 0444, batadv_gateways_open);
+static BATADV_DEBUGINFO(transtable_global, 0444, batadv_transtable_global_open);
#ifdef CONFIG_BATMAN_ADV_BLA
-static BATADV_DEBUGINFO(bla_claim_table, S_IRUGO, batadv_bla_claim_table_open);
-static BATADV_DEBUGINFO(bla_backbone_table, S_IRUGO,
+static BATADV_DEBUGINFO(bla_claim_table, 0444, batadv_bla_claim_table_open);
+static BATADV_DEBUGINFO(bla_backbone_table, 0444,
batadv_bla_backbone_table_open);
#endif
#ifdef CONFIG_BATMAN_ADV_DAT
-static BATADV_DEBUGINFO(dat_cache, S_IRUGO, batadv_dat_cache_open);
+static BATADV_DEBUGINFO(dat_cache, 0444, batadv_dat_cache_open);
#endif
-static BATADV_DEBUGINFO(transtable_local, S_IRUGO,
- batadv_transtable_local_open);
+static BATADV_DEBUGINFO(transtable_local, 0444, batadv_transtable_local_open);
#ifdef CONFIG_BATMAN_ADV_NC
-static BATADV_DEBUGINFO(nc_nodes, S_IRUGO, batadv_nc_nodes_open);
+static BATADV_DEBUGINFO(nc_nodes, 0444, batadv_nc_nodes_open);
#endif
#ifdef CONFIG_BATMAN_ADV_MCAST
-static BATADV_DEBUGINFO(mcast_flags, S_IRUGO, batadv_mcast_flags_open);
+static BATADV_DEBUGINFO(mcast_flags, 0444, batadv_mcast_flags_open);
#endif
static struct batadv_debuginfo *batadv_mesh_debuginfos[] = {
@@ -253,7 +251,7 @@ struct batadv_debuginfo batadv_hardif_debuginfo_##_name = { \
}, \
}
-static BATADV_HARDIF_DEBUGINFO(originators, S_IRUGO,
+static BATADV_HARDIF_DEBUGINFO(originators, 0444,
batadv_originators_hardif_open);
static struct batadv_debuginfo *batadv_hardif_debuginfos[] = {
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index e49121ee55f6..9c5d4a65b98c 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index e257efdc5d03..1bfd1dbc2feb 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
*
* Antonio Quartulli
*
@@ -369,12 +369,11 @@ out:
* batadv_dbg_arp - print a debug message containing all the ARP packet details
* @bat_priv: the bat priv with all the soft interface information
* @skb: ARP packet
- * @type: ARP type
* @hdr_size: size of the possible header before the ARP packet
* @msg: message to print together with the debugging information
*/
static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
- u16 type, int hdr_size, char *msg)
+ int hdr_size, char *msg)
{
struct batadv_unicast_4addr_packet *unicast_4addr_packet;
struct batadv_bcast_packet *bcast_pkt;
@@ -441,7 +440,7 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
#else
static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
- u16 type, int hdr_size, char *msg)
+ int hdr_size, char *msg)
{
}
@@ -950,6 +949,41 @@ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size)
}
/**
+ * batadv_dat_arp_create_reply - create an ARP Reply
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ip_src: ARP sender IP
+ * @ip_dst: ARP target IP
+ * @hw_src: Ethernet source and ARP sender MAC
+ * @hw_dst: Ethernet destination and ARP target MAC
+ * @vid: VLAN identifier (optional, set to zero otherwise)
+ *
+ * Creates an ARP Reply from the given values, optionally encapsulated in a
+ * VLAN header.
+ *
+ * Return: An skb containing an ARP Reply.
+ */
+static struct sk_buff *
+batadv_dat_arp_create_reply(struct batadv_priv *bat_priv, __be32 ip_src,
+ __be32 ip_dst, u8 *hw_src, u8 *hw_dst,
+ unsigned short vid)
+{
+ struct sk_buff *skb;
+
+ skb = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_dst, bat_priv->soft_iface,
+ ip_src, hw_dst, hw_src, hw_dst);
+ if (!skb)
+ return NULL;
+
+ skb_reset_mac_header(skb);
+
+ if (vid & BATADV_VLAN_HAS_TAG)
+ skb = vlan_insert_tag(skb, htons(ETH_P_8021Q),
+ vid & VLAN_VID_MASK);
+
+ return skb;
+}
+
+/**
* batadv_dat_snoop_outgoing_arp_request - snoop the ARP request and try to
* answer using DAT
* @bat_priv: the bat priv with all the soft interface information
@@ -983,8 +1017,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
if (type != ARPOP_REQUEST)
goto out;
- batadv_dbg_arp(bat_priv, skb, type, hdr_size,
- "Parsing outgoing ARP REQUEST");
+ batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing outgoing ARP REQUEST");
ip_src = batadv_arp_ip_src(skb, hdr_size);
hw_src = batadv_arp_hw_src(skb, hdr_size);
@@ -1007,25 +1040,16 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
goto out;
}
- skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src,
- bat_priv->soft_iface, ip_dst, hw_src,
- dat_entry->mac_addr, hw_src);
+ skb_new = batadv_dat_arp_create_reply(bat_priv, ip_dst, ip_src,
+ dat_entry->mac_addr,
+ hw_src, vid);
if (!skb_new)
goto out;
- if (vid & BATADV_VLAN_HAS_TAG) {
- skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q),
- vid & VLAN_VID_MASK);
- if (!skb_new)
- goto out;
- }
-
- skb_reset_mac_header(skb_new);
skb_new->protocol = eth_type_trans(skb_new,
bat_priv->soft_iface);
bat_priv->stats.rx_packets++;
bat_priv->stats.rx_bytes += skb->len + ETH_HLEN + hdr_size;
- bat_priv->soft_iface->last_rx = jiffies;
netif_rx(skb_new);
batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP request replied locally\n");
@@ -1075,8 +1099,7 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
ip_src = batadv_arp_ip_src(skb, hdr_size);
ip_dst = batadv_arp_ip_dst(skb, hdr_size);
- batadv_dbg_arp(bat_priv, skb, type, hdr_size,
- "Parsing incoming ARP REQUEST");
+ batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing incoming ARP REQUEST");
batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid);
@@ -1084,25 +1107,11 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
if (!dat_entry)
goto out;
- skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src,
- bat_priv->soft_iface, ip_dst, hw_src,
- dat_entry->mac_addr, hw_src);
-
+ skb_new = batadv_dat_arp_create_reply(bat_priv, ip_dst, ip_src,
+ dat_entry->mac_addr, hw_src, vid);
if (!skb_new)
goto out;
- /* the rest of the TX path assumes that the mac_header offset pointing
- * to the inner Ethernet header has been set, therefore reset it now.
- */
- skb_reset_mac_header(skb_new);
-
- if (vid & BATADV_VLAN_HAS_TAG) {
- skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q),
- vid & VLAN_VID_MASK);
- if (!skb_new)
- goto out;
- }
-
/* To preserve backwards compatibility, the node has choose the outgoing
* format based on the incoming request packet type. The assumption is
* that a node not using the 4addr packet format doesn't support it.
@@ -1149,8 +1158,7 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
if (type != ARPOP_REPLY)
return;
- batadv_dbg_arp(bat_priv, skb, type, hdr_size,
- "Parsing outgoing ARP REPLY");
+ batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing outgoing ARP REPLY");
hw_src = batadv_arp_hw_src(skb, hdr_size);
ip_src = batadv_arp_ip_src(skb, hdr_size);
@@ -1195,8 +1203,7 @@ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
if (type != ARPOP_REPLY)
goto out;
- batadv_dbg_arp(bat_priv, skb, type, hdr_size,
- "Parsing incoming ARP REPLY");
+ batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing incoming ARP REPLY");
hw_src = batadv_arp_hw_src(skb, hdr_size);
ip_src = batadv_arp_ip_src(skb, hdr_size);
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index 813ecea96cf9..ec364a3c1c66 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
*
* Antonio Quartulli
*
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 0934730fb7ff..8f964beaac28 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
*
* Martin Hundebøll <martin@hundeboll.net>
*
@@ -20,6 +20,7 @@
#include <linux/atomic.h>
#include <linux/byteorder/generic.h>
+#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/fs.h>
#include <linux/if_ether.h>
@@ -42,17 +43,23 @@
/**
* batadv_frag_clear_chain - delete entries in the fragment buffer chain
* @head: head of chain with entries.
+ * @dropped: whether the chain is cleared because all fragments are dropped
*
* Free fragments in the passed hlist. Should be called with appropriate lock.
*/
-static void batadv_frag_clear_chain(struct hlist_head *head)
+static void batadv_frag_clear_chain(struct hlist_head *head, bool dropped)
{
struct batadv_frag_list_entry *entry;
struct hlist_node *node;
hlist_for_each_entry_safe(entry, node, head, list) {
hlist_del(&entry->list);
- kfree_skb(entry->skb);
+
+ if (dropped)
+ kfree_skb(entry->skb);
+ else
+ consume_skb(entry->skb);
+
kfree(entry);
}
}
@@ -73,7 +80,7 @@ void batadv_frag_purge_orig(struct batadv_orig_node *orig_node,
spin_lock_bh(&chain->lock);
if (!check_cb || check_cb(chain)) {
- batadv_frag_clear_chain(&chain->head);
+ batadv_frag_clear_chain(&chain->fragment_list, true);
chain->size = 0;
}
@@ -117,8 +124,8 @@ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain,
if (chain->seqno == seqno)
return false;
- if (!hlist_empty(&chain->head))
- batadv_frag_clear_chain(&chain->head);
+ if (!hlist_empty(&chain->fragment_list))
+ batadv_frag_clear_chain(&chain->fragment_list, true);
chain->size = 0;
chain->seqno = seqno;
@@ -176,7 +183,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node,
chain = &orig_node->fragments[bucket];
spin_lock_bh(&chain->lock);
if (batadv_frag_init_chain(chain, seqno)) {
- hlist_add_head(&frag_entry_new->list, &chain->head);
+ hlist_add_head(&frag_entry_new->list, &chain->fragment_list);
chain->size = skb->len - hdr_size;
chain->timestamp = jiffies;
chain->total_size = ntohs(frag_packet->total_size);
@@ -185,7 +192,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node,
}
/* Find the position for the new fragment. */
- hlist_for_each_entry(frag_entry_curr, &chain->head, list) {
+ hlist_for_each_entry(frag_entry_curr, &chain->fragment_list, list) {
/* Drop packet if fragment already exists. */
if (frag_entry_curr->no == frag_entry_new->no)
goto err_unlock;
@@ -220,11 +227,11 @@ out:
* exceeds the maximum size of one merged packet. Don't allow
* packets to have different total_size.
*/
- batadv_frag_clear_chain(&chain->head);
+ batadv_frag_clear_chain(&chain->fragment_list, true);
chain->size = 0;
} else if (ntohs(frag_packet->total_size) == chain->size) {
/* All fragments received. Hand over chain to caller. */
- hlist_move_list(&chain->head, chain_out);
+ hlist_move_list(&chain->fragment_list, chain_out);
chain->size = 0;
}
@@ -232,8 +239,10 @@ err_unlock:
spin_unlock_bh(&chain->lock);
err:
- if (!ret)
+ if (!ret) {
kfree(frag_entry_new);
+ kfree_skb(skb);
+ }
return ret;
}
@@ -252,8 +261,9 @@ batadv_frag_merge_packets(struct hlist_head *chain)
{
struct batadv_frag_packet *packet;
struct batadv_frag_list_entry *entry;
- struct sk_buff *skb_out = NULL;
+ struct sk_buff *skb_out;
int size, hdr_size = sizeof(struct batadv_frag_packet);
+ bool dropped = false;
/* Remove first entry, as this is the destination for the rest of the
* fragments.
@@ -270,6 +280,7 @@ batadv_frag_merge_packets(struct hlist_head *chain)
if (pskb_expand_head(skb_out, 0, size - skb_out->len, GFP_ATOMIC) < 0) {
kfree_skb(skb_out);
skb_out = NULL;
+ dropped = true;
goto free;
}
@@ -291,7 +302,7 @@ batadv_frag_merge_packets(struct hlist_head *chain)
free:
/* Locking is not needed, because 'chain' is not part of any orig. */
- batadv_frag_clear_chain(chain);
+ batadv_frag_clear_chain(chain, dropped);
return skb_out;
}
@@ -304,7 +315,7 @@ free:
*
* There are three possible outcomes: 1) Packet is merged: Return true and
* set *skb to merged packet; 2) Packet is buffered: Return true and set *skb
- * to NULL; 3) Error: Return false and leave skb as is.
+ * to NULL; 3) Error: Return false and free skb.
*
* Return: true when packet is merged or buffered, false when skb is not not
* used.
@@ -329,9 +340,9 @@ bool batadv_frag_skb_buffer(struct sk_buff **skb,
goto out_err;
out:
- *skb = skb_out;
ret = true;
out_err:
+ *skb = skb_out;
return ret;
}
@@ -352,7 +363,7 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb,
struct batadv_orig_node *orig_node_src)
{
struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
- struct batadv_orig_node *orig_node_dst = NULL;
+ struct batadv_orig_node *orig_node_dst;
struct batadv_neigh_node *neigh_node = NULL;
struct batadv_frag_packet *packet;
u16 total_size;
@@ -393,7 +404,7 @@ out:
* batadv_frag_create - create a fragment from skb
* @skb: skb to create fragment from
* @frag_head: header to use in new fragment
- * @mtu: size of new fragment
+ * @fragment_size: size of new fragment
*
* Split the passed skb into two fragments: A new one with size matching the
* passed mtu and the old one with the rest. The new skb contains data from the
@@ -403,11 +414,11 @@ out:
*/
static struct sk_buff *batadv_frag_create(struct sk_buff *skb,
struct batadv_frag_packet *frag_head,
- unsigned int mtu)
+ unsigned int fragment_size)
{
struct sk_buff *skb_fragment;
unsigned int header_size = sizeof(*frag_head);
- unsigned int fragment_size = mtu - header_size;
+ unsigned int mtu = fragment_size + header_size;
skb_fragment = netdev_alloc_skb(NULL, mtu + ETH_HLEN);
if (!skb_fragment)
@@ -433,8 +444,7 @@ err:
* @orig_node: final destination of the created fragments
* @neigh_node: next-hop of the created fragments
*
- * Return: the netdev tx status or -1 in case of error.
- * When -1 is returned the skb is not consumed.
+ * Return: the netdev tx status or a negative errno code on a failure
*/
int batadv_frag_send_packet(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
@@ -446,24 +456,33 @@ int batadv_frag_send_packet(struct sk_buff *skb,
struct sk_buff *skb_fragment;
unsigned int mtu = neigh_node->if_incoming->net_dev->mtu;
unsigned int header_size = sizeof(frag_header);
- unsigned int max_fragment_size, max_packet_size;
- int ret = -1;
+ unsigned int max_fragment_size, num_fragments;
+ int ret;
/* To avoid merge and refragmentation at next-hops we never send
* fragments larger than BATADV_FRAG_MAX_FRAG_SIZE
*/
mtu = min_t(unsigned int, mtu, BATADV_FRAG_MAX_FRAG_SIZE);
max_fragment_size = mtu - header_size;
- max_packet_size = max_fragment_size * BATADV_FRAG_MAX_FRAGMENTS;
+
+ if (skb->len == 0 || max_fragment_size == 0)
+ return -EINVAL;
+
+ num_fragments = (skb->len - 1) / max_fragment_size + 1;
+ max_fragment_size = (skb->len - 1) / num_fragments + 1;
/* Don't even try to fragment, if we need more than 16 fragments */
- if (skb->len > max_packet_size)
- goto out;
+ if (num_fragments > BATADV_FRAG_MAX_FRAGMENTS) {
+ ret = -EAGAIN;
+ goto free_skb;
+ }
bat_priv = orig_node->bat_priv;
primary_if = batadv_primary_if_get_selected(bat_priv);
- if (!primary_if)
- goto out;
+ if (!primary_if) {
+ ret = -EINVAL;
+ goto free_skb;
+ }
/* Create one header to be copied to all fragments */
frag_header.packet_type = BATADV_UNICAST_FRAG;
@@ -487,35 +506,37 @@ int batadv_frag_send_packet(struct sk_buff *skb,
/* Eat and send fragments from the tail of skb */
while (skb->len > max_fragment_size) {
- skb_fragment = batadv_frag_create(skb, &frag_header, mtu);
- if (!skb_fragment)
- goto out;
+ /* The initial check in this function should cover this case */
+ if (unlikely(frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1)) {
+ ret = -EINVAL;
+ goto put_primary_if;
+ }
+
+ skb_fragment = batadv_frag_create(skb, &frag_header,
+ max_fragment_size);
+ if (!skb_fragment) {
+ ret = -ENOMEM;
+ goto put_primary_if;
+ }
batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX);
batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
skb_fragment->len + ETH_HLEN);
ret = batadv_send_unicast_skb(skb_fragment, neigh_node);
if (ret != NET_XMIT_SUCCESS) {
- /* return -1 so that the caller can free the original
- * skb
- */
- ret = -1;
- goto out;
+ ret = NET_XMIT_DROP;
+ goto put_primary_if;
}
frag_header.no++;
-
- /* The initial check in this function should cover this case */
- if (frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1) {
- ret = -1;
- goto out;
- }
}
/* Make room for the fragment header. */
if (batadv_skb_head_push(skb, header_size) < 0 ||
- pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0)
- goto out;
+ pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0) {
+ ret = -ENOMEM;
+ goto put_primary_if;
+ }
memcpy(skb->data, &frag_header, header_size);
@@ -524,10 +545,13 @@ int batadv_frag_send_packet(struct sk_buff *skb,
batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
skb->len + ETH_HLEN);
ret = batadv_send_unicast_skb(skb, neigh_node);
+ /* skb was consumed */
+ skb = NULL;
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+put_primary_if:
+ batadv_hardif_put(primary_if);
+free_skb:
+ kfree_skb(skb);
return ret;
}
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index 3202fe329e63..1a2d6c308745 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
*
* Martin Hundebøll <martin@hundeboll.net>
*
@@ -47,7 +47,7 @@ int batadv_frag_send_packet(struct sk_buff *skb,
static inline bool
batadv_frag_check_entry(struct batadv_frag_table_entry *frags_entry)
{
- if (!hlist_empty(&frags_entry->head) &&
+ if (!hlist_empty(&frags_entry->fragment_list) &&
batadv_has_timed_out(frags_entry->timestamp, BATADV_FRAG_TIMEOUT))
return true;
return false;
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index de055d64debe..de9955d5224d 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -348,7 +348,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
spin_lock_bh(&bat_priv->gw.list_lock);
kref_get(&gw_node->refcount);
- hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.list);
+ hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.gateway_list);
spin_unlock_bh(&bat_priv->gw.list_lock);
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
@@ -376,7 +376,8 @@ struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv,
struct batadv_gw_node *gw_node_tmp, *gw_node = NULL;
rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node_tmp, &bat_priv->gw.list, list) {
+ hlist_for_each_entry_rcu(gw_node_tmp, &bat_priv->gw.gateway_list,
+ list) {
if (gw_node_tmp->orig_node != orig_node)
continue;
@@ -475,7 +476,7 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv)
spin_lock_bh(&bat_priv->gw.list_lock);
hlist_for_each_entry_safe(gw_node, node_tmp,
- &bat_priv->gw.list, list) {
+ &bat_priv->gw.gateway_list, list) {
hlist_del_init_rcu(&gw_node->list);
batadv_gw_node_put(gw_node);
}
@@ -704,7 +705,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv,
{
struct batadv_neigh_node *neigh_curr = NULL;
struct batadv_neigh_node *neigh_old = NULL;
- struct batadv_orig_node *orig_dst_node = NULL;
+ struct batadv_orig_node *orig_dst_node;
struct batadv_gw_node *gw_node = NULL;
struct batadv_gw_node *curr_gw = NULL;
struct batadv_neigh_ifinfo *curr_ifinfo, *old_ifinfo;
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index 859166d03561..3baa3d466e5e 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index 21184810d89f..33940c5c74a8 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -253,6 +253,11 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
*/
void batadv_gw_init(struct batadv_priv *bat_priv)
{
+ if (bat_priv->algo_ops->gw.init_sel_class)
+ bat_priv->algo_ops->gw.init_sel_class(bat_priv);
+ else
+ atomic_set(&bat_priv->gw.sel_class, 1);
+
batadv_tvlv_handler_register(bat_priv, batadv_gw_tvlv_ogm_handler_v1,
NULL, BATADV_TVLV_GW, 1,
BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index 8a5e1ddf1175..0a6a97d201f2 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 08ce36147c4c..e348f76ea8c1 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -92,8 +92,8 @@ out:
*
* Return: result of rtnl_link_ops->get_link_net or @fallback_net
*/
-static const struct net *batadv_getlink_net(const struct net_device *netdev,
- const struct net *fallback_net)
+static struct net *batadv_getlink_net(const struct net_device *netdev,
+ struct net *fallback_net)
{
if (!netdev->rtnl_link_ops)
return fallback_net;
@@ -116,9 +116,9 @@ static const struct net *batadv_getlink_net(const struct net_device *netdev,
* Return: true if the devices are each others parent, otherwise false
*/
static bool batadv_mutual_parents(const struct net_device *dev1,
- const struct net *net1,
+ struct net *net1,
const struct net_device *dev2,
- const struct net *net2)
+ struct net *net2)
{
int dev1_parent_iflink = dev_get_iflink(dev1);
int dev2_parent_iflink = dev_get_iflink(dev2);
@@ -154,7 +154,7 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
{
struct net *net = dev_net(net_dev);
struct net_device *parent_dev;
- const struct net *parent_net;
+ struct net *parent_net;
bool ret;
/* check if this is a batman-adv mesh interface */
@@ -202,13 +202,77 @@ static bool batadv_is_valid_iface(const struct net_device *net_dev)
}
/**
- * batadv_is_wifi_netdev - check if the given net_device struct is a wifi
- * interface
+ * batadv_get_real_netdevice - check if the given netdev struct is a virtual
+ * interface on top of another 'real' interface
+ * @netdev: the device to check
+ *
+ * Callers must hold the rtnl semaphore. You may want batadv_get_real_netdev()
+ * instead of this.
+ *
+ * Return: the 'real' net device or the original net device and NULL in case
+ * of an error.
+ */
+static struct net_device *batadv_get_real_netdevice(struct net_device *netdev)
+{
+ struct batadv_hard_iface *hard_iface = NULL;
+ struct net_device *real_netdev = NULL;
+ struct net *real_net;
+ struct net *net;
+ int ifindex;
+
+ ASSERT_RTNL();
+
+ if (!netdev)
+ return NULL;
+
+ if (netdev->ifindex == dev_get_iflink(netdev)) {
+ dev_hold(netdev);
+ return netdev;
+ }
+
+ hard_iface = batadv_hardif_get_by_netdev(netdev);
+ if (!hard_iface || !hard_iface->soft_iface)
+ goto out;
+
+ net = dev_net(hard_iface->soft_iface);
+ ifindex = dev_get_iflink(netdev);
+ real_net = batadv_getlink_net(netdev, net);
+ real_netdev = dev_get_by_index(real_net, ifindex);
+
+out:
+ if (hard_iface)
+ batadv_hardif_put(hard_iface);
+ return real_netdev;
+}
+
+/**
+ * batadv_get_real_netdev - check if the given net_device struct is a virtual
+ * interface on top of another 'real' interface
* @net_device: the device to check
*
- * Return: true if the net device is a 802.11 wireless device, false otherwise.
+ * Return: the 'real' net device or the original net device and NULL in case
+ * of an error.
*/
-bool batadv_is_wifi_netdev(struct net_device *net_device)
+struct net_device *batadv_get_real_netdev(struct net_device *net_device)
+{
+ struct net_device *real_netdev;
+
+ rtnl_lock();
+ real_netdev = batadv_get_real_netdevice(net_device);
+ rtnl_unlock();
+
+ return real_netdev;
+}
+
+/**
+ * batadv_is_wext_netdev - check if the given net_device struct is a
+ * wext wifi interface
+ * @net_device: the device to check
+ *
+ * Return: true if the net device is a wext wireless device, false
+ * otherwise.
+ */
+static bool batadv_is_wext_netdev(struct net_device *net_device)
{
if (!net_device)
return false;
@@ -221,6 +285,22 @@ bool batadv_is_wifi_netdev(struct net_device *net_device)
return true;
#endif
+ return false;
+}
+
+/**
+ * batadv_is_cfg80211_netdev - check if the given net_device struct is a
+ * cfg80211 wifi interface
+ * @net_device: the device to check
+ *
+ * Return: true if the net device is a cfg80211 wireless device, false
+ * otherwise.
+ */
+static bool batadv_is_cfg80211_netdev(struct net_device *net_device)
+{
+ if (!net_device)
+ return false;
+
/* cfg80211 drivers have to set ieee80211_ptr */
if (net_device->ieee80211_ptr)
return true;
@@ -228,6 +308,125 @@ bool batadv_is_wifi_netdev(struct net_device *net_device)
return false;
}
+/**
+ * batadv_wifi_flags_evaluate - calculate wifi flags for net_device
+ * @net_device: the device to check
+ *
+ * Return: batadv_hard_iface_wifi_flags flags of the device
+ */
+static u32 batadv_wifi_flags_evaluate(struct net_device *net_device)
+{
+ u32 wifi_flags = 0;
+ struct net_device *real_netdev;
+
+ if (batadv_is_wext_netdev(net_device))
+ wifi_flags |= BATADV_HARDIF_WIFI_WEXT_DIRECT;
+
+ if (batadv_is_cfg80211_netdev(net_device))
+ wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT;
+
+ real_netdev = batadv_get_real_netdevice(net_device);
+ if (!real_netdev)
+ return wifi_flags;
+
+ if (real_netdev == net_device)
+ goto out;
+
+ if (batadv_is_wext_netdev(real_netdev))
+ wifi_flags |= BATADV_HARDIF_WIFI_WEXT_INDIRECT;
+
+ if (batadv_is_cfg80211_netdev(real_netdev))
+ wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_INDIRECT;
+
+out:
+ dev_put(real_netdev);
+ return wifi_flags;
+}
+
+/**
+ * batadv_is_cfg80211_hardif - check if the given hardif is a cfg80211 wifi
+ * interface
+ * @hard_iface: the device to check
+ *
+ * Return: true if the net device is a cfg80211 wireless device, false
+ * otherwise.
+ */
+bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface)
+{
+ u32 allowed_flags = 0;
+
+ allowed_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT;
+ allowed_flags |= BATADV_HARDIF_WIFI_CFG80211_INDIRECT;
+
+ return !!(hard_iface->wifi_flags & allowed_flags);
+}
+
+/**
+ * batadv_is_wifi_hardif - check if the given hardif is a wifi interface
+ * @hard_iface: the device to check
+ *
+ * Return: true if the net device is a 802.11 wireless device, false otherwise.
+ */
+bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface)
+{
+ if (!hard_iface)
+ return false;
+
+ return hard_iface->wifi_flags != 0;
+}
+
+/**
+ * batadv_hardif_no_broadcast - check whether (re)broadcast is necessary
+ * @if_outgoing: the outgoing interface checked and considered for (re)broadcast
+ * @orig_addr: the originator of this packet
+ * @orig_neigh: originator address of the forwarder we just got the packet from
+ * (NULL if we originated)
+ *
+ * Checks whether a packet needs to be (re)broadcasted on the given interface.
+ *
+ * Return:
+ * BATADV_HARDIF_BCAST_NORECIPIENT: No neighbor on interface
+ * BATADV_HARDIF_BCAST_DUPFWD: Just one neighbor, but it is the forwarder
+ * BATADV_HARDIF_BCAST_DUPORIG: Just one neighbor, but it is the originator
+ * BATADV_HARDIF_BCAST_OK: Several neighbors, must broadcast
+ */
+int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing,
+ u8 *orig_addr, u8 *orig_neigh)
+{
+ struct batadv_hardif_neigh_node *hardif_neigh;
+ struct hlist_node *first;
+ int ret = BATADV_HARDIF_BCAST_OK;
+
+ rcu_read_lock();
+
+ /* 0 neighbors -> no (re)broadcast */
+ first = rcu_dereference(hlist_first_rcu(&if_outgoing->neigh_list));
+ if (!first) {
+ ret = BATADV_HARDIF_BCAST_NORECIPIENT;
+ goto out;
+ }
+
+ /* >1 neighbors -> (re)brodcast */
+ if (rcu_dereference(hlist_next_rcu(first)))
+ goto out;
+
+ hardif_neigh = hlist_entry(first, struct batadv_hardif_neigh_node,
+ list);
+
+ /* 1 neighbor, is the originator -> no rebroadcast */
+ if (orig_addr && batadv_compare_eth(hardif_neigh->orig, orig_addr)) {
+ ret = BATADV_HARDIF_BCAST_DUPORIG;
+ /* 1 neighbor, is the one we received from -> no rebroadcast */
+ } else if (orig_neigh &&
+ batadv_compare_eth(hardif_neigh->orig, orig_neigh)) {
+ ret = BATADV_HARDIF_BCAST_DUPFWD;
+ }
+
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
static struct batadv_hard_iface *
batadv_hardif_get_active(const struct net_device *soft_iface)
{
@@ -697,7 +896,8 @@ batadv_hardif_add_interface(struct net_device *net_dev)
kref_init(&hard_iface->refcount);
hard_iface->num_bcasts = BATADV_NUM_BCASTS_DEFAULT;
- if (batadv_is_wifi_netdev(net_dev))
+ hard_iface->wifi_flags = batadv_wifi_flags_evaluate(net_dev);
+ if (batadv_is_wifi_hardif(hard_iface))
hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
batadv_v_hardif_init(hard_iface);
@@ -806,6 +1006,11 @@ static int batadv_hard_if_event(struct notifier_block *this,
if (hard_iface == primary_if)
batadv_primary_if_update_addr(bat_priv, NULL);
break;
+ case NETDEV_CHANGEUPPER:
+ hard_iface->wifi_flags = batadv_wifi_flags_evaluate(net_dev);
+ if (batadv_is_wifi_hardif(hard_iface))
+ hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
+ break;
default:
break;
}
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index a76724d369bf..9f9890ff7a22 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -40,6 +40,20 @@ enum batadv_hard_if_state {
};
/**
+ * enum batadv_hard_if_bcast - broadcast avoidance options
+ * @BATADV_HARDIF_BCAST_OK: Do broadcast on according hard interface
+ * @BATADV_HARDIF_BCAST_NORECIPIENT: Broadcast not needed, there is no recipient
+ * @BATADV_HARDIF_BCAST_DUPFWD: There is just the neighbor we got it from
+ * @BATADV_HARDIF_BCAST_DUPORIG: There is just the originator
+ */
+enum batadv_hard_if_bcast {
+ BATADV_HARDIF_BCAST_OK = 0,
+ BATADV_HARDIF_BCAST_NORECIPIENT,
+ BATADV_HARDIF_BCAST_DUPFWD,
+ BATADV_HARDIF_BCAST_DUPORIG,
+};
+
+/**
* enum batadv_hard_if_cleanup - Cleanup modi for soft_iface after slave removal
* @BATADV_IF_CLEANUP_KEEP: Don't automatically delete soft-interface
* @BATADV_IF_CLEANUP_AUTO: Delete soft-interface after last slave was removed
@@ -51,8 +65,9 @@ enum batadv_hard_if_cleanup {
extern struct notifier_block batadv_hard_if_notifier;
-bool batadv_is_wifi_netdev(struct net_device *net_device);
-bool batadv_is_wifi_iface(int ifindex);
+struct net_device *batadv_get_real_netdev(struct net_device *net_device);
+bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface);
+bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface);
struct batadv_hard_iface*
batadv_hardif_get_by_netdev(const struct net_device *net_dev);
int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
@@ -63,6 +78,8 @@ void batadv_hardif_remove_interfaces(void);
int batadv_hardif_min_mtu(struct net_device *soft_iface);
void batadv_update_min_mtu(struct net_device *soft_iface);
void batadv_hardif_release(struct kref *ref);
+int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing,
+ u8 *orig_addr, u8 *orig_neigh);
/**
* batadv_hardif_put - decrement the hard interface refcounter and possibly
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index a0a0fdb85805..b5f7e13918ac 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index cbbf87075f06..0c905e91c5e2 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
@@ -61,36 +61,6 @@ void batadv_hash_set_lock_class(struct batadv_hashtable *hash,
/* free only the hashtable and the hash itself. */
void batadv_hash_destroy(struct batadv_hashtable *hash);
-/* remove the hash structure. if hashdata_free_cb != NULL, this function will be
- * called to remove the elements inside of the hash. if you don't remove the
- * elements, memory might be leaked.
- */
-static inline void batadv_hash_delete(struct batadv_hashtable *hash,
- batadv_hashdata_free_cb free_cb,
- void *arg)
-{
- struct hlist_head *head;
- struct hlist_node *node, *node_tmp;
- spinlock_t *list_lock; /* spinlock to protect write access */
- u32 i;
-
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
- list_lock = &hash->list_locks[i];
-
- spin_lock_bh(list_lock);
- hlist_for_each_safe(node, node_tmp, head) {
- hlist_del_rcu(node);
-
- if (free_cb)
- free_cb(node, arg);
- }
- spin_unlock_bh(list_lock);
- }
-
- batadv_hash_destroy(hash);
-}
-
/**
* batadv_hash_add - adds data to the hashtable
* @hash: storage hash table
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index 378cc1119d66..6308c9f0fd96 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -38,7 +38,6 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/stat.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/uaccess.h>
@@ -322,8 +321,8 @@ int batadv_socket_setup(struct batadv_priv *bat_priv)
if (!bat_priv->debug_dir)
goto err;
- d = debugfs_create_file(BATADV_ICMP_SOCKET, S_IFREG | S_IWUSR | S_IRUSR,
- bat_priv->debug_dir, bat_priv, &batadv_fops);
+ d = debugfs_create_file(BATADV_ICMP_SOCKET, 0600, bat_priv->debug_dir,
+ bat_priv, &batadv_fops);
if (!d)
goto err;
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index e44a7da51431..f3fec40aae86 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index 56dc532f7a2c..4ef4bde2cc2d 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -31,7 +31,6 @@
#include <linux/sched.h> /* for linux/wait.h */
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/stat.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/uaccess.h>
@@ -212,8 +211,7 @@ int batadv_debug_log_setup(struct batadv_priv *bat_priv)
spin_lock_init(&bat_priv->debug_log->lock);
init_waitqueue_head(&bat_priv->debug_log->queue_wait);
- d = debugfs_create_file("log", S_IFREG | S_IRUSR,
- bat_priv->debug_dir, bat_priv,
+ d = debugfs_create_file("log", 0400, bat_priv->debug_dir, bat_priv,
&batadv_log_fops);
if (!d)
goto err;
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
index d2905a855d1b..7a2b9f4da078 100644
--- a/net/batman-adv/log.h
+++ b/net/batman-adv/log.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -71,12 +71,12 @@ int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
__printf(2, 3);
/* possibly ratelimited debug output */
-#define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \
- do { \
- if (atomic_read(&bat_priv->log_level) & type && \
- (!ratelimited || net_ratelimit())) \
- batadv_debug_log(bat_priv, fmt, ## arg);\
- } \
+#define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \
+ do { \
+ if (atomic_read(&(bat_priv)->log_level) & (type) && \
+ (!(ratelimited) || net_ratelimit())) \
+ batadv_debug_log(bat_priv, fmt, ## arg); \
+ } \
while (0)
#else /* !CONFIG_BATMAN_ADV_DEBUG */
__printf(4, 5)
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 2c017ab47557..5000c540614d 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -23,6 +23,7 @@
#include <linux/crc32c.h>
#include <linux/errno.h>
#include <linux/fs.h>
+#include <linux/genetlink.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
@@ -44,6 +45,7 @@
#include <linux/workqueue.h>
#include <net/dsfield.h>
#include <net/rtnetlink.h>
+#include <uapi/linux/batman_adv.h>
#include "bat_algo.h"
#include "bat_iv_ogm.h"
@@ -160,7 +162,7 @@ int batadv_mesh_init(struct net_device *soft_iface)
INIT_HLIST_HEAD(&bat_priv->forw_bat_list);
INIT_HLIST_HEAD(&bat_priv->forw_bcast_list);
- INIT_HLIST_HEAD(&bat_priv->gw.list);
+ INIT_HLIST_HEAD(&bat_priv->gw.gateway_list);
#ifdef CONFIG_BATMAN_ADV_MCAST
INIT_HLIST_HEAD(&bat_priv->mcast.want_all_unsnoopables_list);
INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv4_list);
@@ -402,6 +404,8 @@ void batadv_skb_set_priority(struct sk_buff *skb, int offset)
static int batadv_recv_unhandled_packet(struct sk_buff *skb,
struct batadv_hard_iface *recv_if)
{
+ kfree_skb(skb);
+
return NET_RX_DROP;
}
@@ -416,7 +420,6 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
struct batadv_ogm_packet *batadv_ogm_packet;
struct batadv_hard_iface *hard_iface;
u8 idx;
- int ret;
hard_iface = container_of(ptype, struct batadv_hard_iface,
batman_adv_ptype);
@@ -466,14 +469,8 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
/* reset control block to avoid left overs from previous users */
memset(skb->cb, 0, sizeof(struct batadv_skb_cb));
- /* all receive handlers return whether they received or reused
- * the supplied skb. if not, we have to free the skb.
- */
idx = batadv_ogm_packet->packet_type;
- ret = (*batadv_rx_handler[idx])(skb, hard_iface);
-
- if (ret == NET_RX_DROP)
- kfree_skb(skb);
+ (*batadv_rx_handler[idx])(skb, hard_iface);
batadv_hardif_put(hard_iface);
@@ -653,3 +650,4 @@ MODULE_DESCRIPTION(BATADV_DRIVER_DESC);
MODULE_SUPPORTED_DEVICE(BATADV_DRIVER_DEVICE);
MODULE_VERSION(BATADV_SOURCE_VERSION);
MODULE_ALIAS_RTNL_LINK("batadv");
+MODULE_ALIAS_GENL_FAMILY(BATADV_NL_NAME);
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 09af21e27639..57a8103dbce7 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -24,7 +24,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2016.4"
+#define BATADV_SOURCE_VERSION "2017.0"
#endif
/* B.A.T.M.A.N. parameters */
@@ -48,6 +48,7 @@
#define BATADV_TT_CLIENT_TEMP_TIMEOUT 600000 /* in milliseconds */
#define BATADV_TT_WORK_PERIOD 5000 /* 5 seconds */
#define BATADV_ORIG_WORK_PERIOD 1000 /* 1 second */
+#define BATADV_MCAST_WORK_PERIOD 500 /* 0.5 seconds */
#define BATADV_DAT_ENTRY_TIMEOUT (5 * 60000) /* 5 mins in milliseconds */
/* sliding packet range of received originator messages in sequence numbers
* (should be a multiple of our word size)
@@ -185,7 +186,6 @@ enum batadv_uev_type {
#include <linux/bitops.h> /* for packet.h */
#include <linux/compiler.h>
-#include <linux/cpumask.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h> /* for packet.h */
#include <linux/if_vlan.h>
@@ -200,8 +200,8 @@ struct packet_type;
struct seq_file;
struct sk_buff;
-#define BATADV_PRINT_VID(vid) ((vid & BATADV_VLAN_HAS_TAG) ? \
- (int)(vid & VLAN_VID_MASK) : -1)
+#define BATADV_PRINT_VID(vid) (((vid) & BATADV_VLAN_HAS_TAG) ? \
+ (int)((vid) & VLAN_VID_MASK) : -1)
extern struct list_head batadv_hardif_list;
@@ -284,26 +284,6 @@ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx,
#define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1)
-/**
- * batadv_sum_counter - Sum the cpu-local counters for index 'idx'
- * @bat_priv: the bat priv with all the soft interface information
- * @idx: index of counter to sum up
- *
- * Return: sum of all cpu-local counters
- */
-static inline u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx)
-{
- u64 *counters, sum = 0;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- counters = per_cpu_ptr(bat_priv->bat_counters, cpu);
- sum += counters[idx];
- }
-
- return sum;
-}
-
/* Define a macro to reach the control buffer of the skb. The members of the
* control buffer are defined in struct batadv_skb_cb in types.h.
* The macro is inspired by the similar macro TCP_SKB_CB() in tcp.h.
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 13661f43386f..952ba81a565b 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors:
*
* Linus Lüssing
*
@@ -33,6 +33,7 @@
#include <linux/in6.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
+#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
@@ -48,6 +49,7 @@
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>
+#include <linux/workqueue.h>
#include <net/addrconf.h>
#include <net/if_inet6.h>
#include <net/ip.h>
@@ -60,6 +62,18 @@
#include "translation-table.h"
#include "tvlv.h"
+static void batadv_mcast_mla_update(struct work_struct *work);
+
+/**
+ * batadv_mcast_start_timer - schedule the multicast periodic worker
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_mcast_start_timer(struct batadv_priv *bat_priv)
+{
+ queue_delayed_work(batadv_event_workqueue, &bat_priv->mcast.work,
+ msecs_to_jiffies(BATADV_MCAST_WORK_PERIOD));
+}
+
/**
* batadv_mcast_get_bridge - get the bridge on top of the softif if it exists
* @soft_iface: netdev struct of the mesh interface
@@ -231,19 +245,15 @@ out:
/**
* batadv_mcast_mla_list_free - free a list of multicast addresses
- * @bat_priv: the bat priv with all the soft interface information
* @mcast_list: the list to free
*
* Removes and frees all items in the given mcast_list.
*/
-static void batadv_mcast_mla_list_free(struct batadv_priv *bat_priv,
- struct hlist_head *mcast_list)
+static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list)
{
struct batadv_hw_addr *mcast_entry;
struct hlist_node *tmp;
- lockdep_assert_held(&bat_priv->tt.commit_lock);
-
hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) {
hlist_del(&mcast_entry->list);
kfree(mcast_entry);
@@ -259,6 +269,8 @@ static void batadv_mcast_mla_list_free(struct batadv_priv *bat_priv,
* translation table except the ones listed in the given mcast_list.
*
* If mcast_list is NULL then all are retracted.
+ *
+ * Do not call outside of the mcast worker! (or cancel mcast worker first)
*/
static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv,
struct hlist_head *mcast_list)
@@ -266,7 +278,7 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv,
struct batadv_hw_addr *mcast_entry;
struct hlist_node *tmp;
- lockdep_assert_held(&bat_priv->tt.commit_lock);
+ WARN_ON(delayed_work_pending(&bat_priv->mcast.work));
hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list,
list) {
@@ -291,6 +303,8 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv,
*
* Adds multicast listener announcements from the given mcast_list to the
* translation table if they have not been added yet.
+ *
+ * Do not call outside of the mcast worker! (or cancel mcast worker first)
*/
static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
struct hlist_head *mcast_list)
@@ -298,7 +312,7 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
struct batadv_hw_addr *mcast_entry;
struct hlist_node *tmp;
- lockdep_assert_held(&bat_priv->tt.commit_lock);
+ WARN_ON(delayed_work_pending(&bat_priv->mcast.work));
if (!mcast_list)
return;
@@ -532,13 +546,18 @@ update:
}
/**
- * batadv_mcast_mla_update - update the own MLAs
+ * __batadv_mcast_mla_update - update the own MLAs
* @bat_priv: the bat priv with all the soft interface information
*
* Updates the own multicast listener announcements in the translation
* table as well as the own, announced multicast tvlv container.
+ *
+ * Note that non-conflicting reads and writes to bat_priv->mcast.mla_list
+ * in batadv_mcast_mla_tt_retract() and batadv_mcast_mla_tt_add() are
+ * ensured by the non-parallel execution of the worker this function
+ * belongs to.
*/
-void batadv_mcast_mla_update(struct batadv_priv *bat_priv)
+static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv)
{
struct net_device *soft_iface = bat_priv->soft_iface;
struct hlist_head mcast_list = HLIST_HEAD_INIT;
@@ -560,7 +579,30 @@ update:
batadv_mcast_mla_tt_add(bat_priv, &mcast_list);
out:
- batadv_mcast_mla_list_free(bat_priv, &mcast_list);
+ batadv_mcast_mla_list_free(&mcast_list);
+}
+
+/**
+ * batadv_mcast_mla_update - update the own MLAs
+ * @work: kernel work struct
+ *
+ * Updates the own multicast listener announcements in the translation
+ * table as well as the own, announced multicast tvlv container.
+ *
+ * In the end, reschedules the work timer.
+ */
+static void batadv_mcast_mla_update(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_priv_mcast *priv_mcast;
+ struct batadv_priv *bat_priv;
+
+ delayed_work = to_delayed_work(work);
+ priv_mcast = container_of(delayed_work, struct batadv_priv_mcast, work);
+ bat_priv = container_of(priv_mcast, struct batadv_priv, mcast);
+
+ __batadv_mcast_mla_update(bat_priv);
+ batadv_mcast_start_timer(bat_priv);
}
/**
@@ -1132,6 +1174,9 @@ void batadv_mcast_init(struct batadv_priv *bat_priv)
batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler,
NULL, BATADV_TVLV_MCAST, 2,
BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
+
+ INIT_DELAYED_WORK(&bat_priv->mcast.work, batadv_mcast_mla_update);
+ batadv_mcast_start_timer(bat_priv);
}
#ifdef CONFIG_BATMAN_ADV_DEBUGFS
@@ -1243,12 +1288,13 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset)
*/
void batadv_mcast_free(struct batadv_priv *bat_priv)
{
+ cancel_delayed_work_sync(&bat_priv->mcast.work);
+
batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 2);
batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 2);
- spin_lock_bh(&bat_priv->tt.commit_lock);
+ /* safely calling outside of worker, as worker was canceled above */
batadv_mcast_mla_tt_retract(bat_priv, NULL);
- spin_unlock_bh(&bat_priv->tt.commit_lock);
}
/**
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 1fb00ba84907..2a78cddab0e9 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors:
*
* Linus Lüssing
*
@@ -39,8 +39,6 @@ enum batadv_forw_mode {
#ifdef CONFIG_BATMAN_ADV_MCAST
-void batadv_mcast_mla_update(struct batadv_priv *bat_priv);
-
enum batadv_forw_mode
batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
struct batadv_orig_node **mcast_single_orig);
@@ -55,10 +53,6 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node);
#else
-static inline void batadv_mcast_mla_update(struct batadv_priv *bat_priv)
-{
-}
-
static inline enum batadv_forw_mode
batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
struct batadv_orig_node **mcast_single_orig)
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 64cb6acbe0a6..ab13b4d58733 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors:
*
* Matthias Schiffer
*
@@ -20,11 +20,14 @@
#include <linux/atomic.h>
#include <linux/byteorder/generic.h>
+#include <linux/cache.h>
#include <linux/errno.h>
+#include <linux/export.h>
#include <linux/fs.h>
#include <linux/genetlink.h>
#include <linux/if_ether.h>
#include <linux/init.h>
+#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/printk.h>
@@ -48,14 +51,7 @@
#include "tp_meter.h"
#include "translation-table.h"
-struct genl_family batadv_netlink_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = BATADV_NL_NAME,
- .version = 1,
- .maxattr = BATADV_ATTR_MAX,
- .netnsok = true,
-};
+struct genl_family batadv_netlink_family;
/* multicast groups */
enum batadv_netlink_multicast_groups {
@@ -534,7 +530,7 @@ batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb)
return msg->len;
}
-static struct genl_ops batadv_netlink_ops[] = {
+static const struct genl_ops batadv_netlink_ops[] = {
{
.cmd = BATADV_CMD_GET_MESH_INFO,
.flags = GENL_ADMIN_PERM,
@@ -610,6 +606,19 @@ static struct genl_ops batadv_netlink_ops[] = {
};
+struct genl_family batadv_netlink_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = BATADV_NL_NAME,
+ .version = 1,
+ .maxattr = BATADV_ATTR_MAX,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = batadv_netlink_ops,
+ .n_ops = ARRAY_SIZE(batadv_netlink_ops),
+ .mcgrps = batadv_netlink_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(batadv_netlink_mcgrps),
+};
+
/**
* batadv_netlink_register - register batadv genl netlink family
*/
@@ -617,9 +626,7 @@ void __init batadv_netlink_register(void)
{
int ret;
- ret = genl_register_family_with_ops_groups(&batadv_netlink_family,
- batadv_netlink_ops,
- batadv_netlink_mcgrps);
+ ret = genl_register_family(&batadv_netlink_family);
if (ret)
pr_warn("unable to register netlink family");
}
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
index 52eb16281aba..f1cd8c5da966 100644
--- a/net/batman-adv/netlink.h
+++ b/net/batman-adv/netlink.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors:
*
* Matthias Schiffer
*
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index e3baf697a35c..e1f6fc72fe3e 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors:
*
* Martin Hundebøll, Jeppe Ledet-Pedersen
*
@@ -44,7 +44,6 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/stat.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/workqueue.h>
@@ -261,10 +260,16 @@ static void batadv_nc_path_put(struct batadv_nc_path *nc_path)
/**
* batadv_nc_packet_free - frees nc packet
* @nc_packet: the nc packet to free
+ * @dropped: whether the packet is freed because is is dropped
*/
-static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet)
+static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet,
+ bool dropped)
{
- kfree_skb(nc_packet->skb);
+ if (dropped)
+ kfree_skb(nc_packet->skb);
+ else
+ consume_skb(nc_packet->skb);
+
batadv_nc_path_put(nc_packet->nc_path);
kfree(nc_packet);
}
@@ -577,7 +582,7 @@ static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet)
{
batadv_send_unicast_skb(nc_packet->skb, nc_packet->neigh_node);
nc_packet->skb = NULL;
- batadv_nc_packet_free(nc_packet);
+ batadv_nc_packet_free(nc_packet, false);
}
/**
@@ -611,7 +616,7 @@ static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv,
/* purge nc packet */
list_del(&nc_packet->list);
- batadv_nc_packet_free(nc_packet);
+ batadv_nc_packet_free(nc_packet, true);
res = true;
@@ -1209,11 +1214,11 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
}
/* skb_src is now coded into skb_dest, so free it */
- kfree_skb(skb_src);
+ consume_skb(skb_src);
/* avoid duplicate free of skb from nc_packet */
nc_packet->skb = NULL;
- batadv_nc_packet_free(nc_packet);
+ batadv_nc_packet_free(nc_packet, false);
/* Send the coded packet and return true */
batadv_send_unicast_skb(skb_dest, first_dest);
@@ -1400,7 +1405,7 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv,
/* batadv_nc_skb_store_for_decoding() clones the skb, so we must free
* our ref
*/
- kfree_skb(skb);
+ consume_skb(skb);
}
/**
@@ -1724,7 +1729,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
ether_addr_copy(unicast_packet->dest, orig_dest);
unicast_packet->ttvn = ttvn;
- batadv_nc_packet_free(nc_packet);
+ batadv_nc_packet_free(nc_packet, false);
return unicast_packet;
}
@@ -1814,11 +1819,11 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
/* Check if network coding is enabled */
if (!atomic_read(&bat_priv->network_coding))
- return NET_RX_DROP;
+ goto free_skb;
/* Make sure we can access (and remove) header */
if (unlikely(!pskb_may_pull(skb, hdr_size)))
- return NET_RX_DROP;
+ goto free_skb;
coded_packet = (struct batadv_coded_packet *)skb->data;
ethhdr = eth_hdr(skb);
@@ -1826,7 +1831,7 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
/* Verify frame is destined for us */
if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest) &&
!batadv_is_my_mac(bat_priv, coded_packet->second_dest))
- return NET_RX_DROP;
+ goto free_skb;
/* Update stat counter */
if (batadv_is_my_mac(bat_priv, coded_packet->second_dest))
@@ -1836,7 +1841,7 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
coded_packet);
if (!nc_packet) {
batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED);
- return NET_RX_DROP;
+ goto free_skb;
}
/* Make skb's linear, because decoding accesses the entire buffer */
@@ -1861,7 +1866,10 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
return batadv_recv_unicast_packet(skb, recv_if);
free_nc_packet:
- batadv_nc_packet_free(nc_packet);
+ batadv_nc_packet_free(nc_packet, true);
+free_skb:
+ kfree_skb(skb);
+
return NET_RX_DROP;
}
@@ -1961,17 +1969,16 @@ int batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
if (!nc_dir)
goto out;
- file = debugfs_create_u8("min_tq", S_IRUGO | S_IWUSR, nc_dir,
- &bat_priv->nc.min_tq);
+ file = debugfs_create_u8("min_tq", 0644, nc_dir, &bat_priv->nc.min_tq);
if (!file)
goto out;
- file = debugfs_create_u32("max_fwd_delay", S_IRUGO | S_IWUSR, nc_dir,
+ file = debugfs_create_u32("max_fwd_delay", 0644, nc_dir,
&bat_priv->nc.max_fwd_delay);
if (!file)
goto out;
- file = debugfs_create_u32("max_buffer_time", S_IRUGO | S_IWUSR, nc_dir,
+ file = debugfs_create_u32("max_buffer_time", 0644, nc_dir,
&bat_priv->nc.max_buffer_time);
if (!file)
goto out;
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
index d6d7fb4ec5d5..c66efb81d2f4 100644
--- a/net/batman-adv/network-coding.h
+++ b/net/batman-adv/network-coding.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors:
*
* Martin Hundebøll, Jeppe Ledet-Pedersen
*
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 7c8d16086f0f..8e2a4b205257 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -364,7 +364,7 @@ struct batadv_orig_ifinfo *
batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
struct batadv_hard_iface *if_outgoing)
{
- struct batadv_orig_ifinfo *orig_ifinfo = NULL;
+ struct batadv_orig_ifinfo *orig_ifinfo;
unsigned long reset_time;
spin_lock_bh(&orig_node->neigh_list_lock);
@@ -512,15 +512,17 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
* batadv_hardif_neigh_create - create a hardif neighbour node
* @hard_iface: the interface this neighbour is connected to
* @neigh_addr: the interface address of the neighbour to retrieve
+ * @orig_node: originator object representing the neighbour
*
* Return: the hardif neighbour node if found or created or NULL otherwise.
*/
static struct batadv_hardif_neigh_node *
batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
- const u8 *neigh_addr)
+ const u8 *neigh_addr,
+ struct batadv_orig_node *orig_node)
{
struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
- struct batadv_hardif_neigh_node *hardif_neigh = NULL;
+ struct batadv_hardif_neigh_node *hardif_neigh;
spin_lock_bh(&hard_iface->neigh_list_lock);
@@ -536,6 +538,7 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
kref_get(&hard_iface->refcount);
INIT_HLIST_NODE(&hardif_neigh->list);
ether_addr_copy(hardif_neigh->addr, neigh_addr);
+ ether_addr_copy(hardif_neigh->orig, orig_node->orig);
hardif_neigh->if_incoming = hard_iface;
hardif_neigh->last_seen = jiffies;
@@ -556,21 +559,23 @@ out:
* node
* @hard_iface: the interface this neighbour is connected to
* @neigh_addr: the interface address of the neighbour to retrieve
+ * @orig_node: originator object representing the neighbour
*
* Return: the hardif neighbour node if found or created or NULL otherwise.
*/
static struct batadv_hardif_neigh_node *
batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface,
- const u8 *neigh_addr)
+ const u8 *neigh_addr,
+ struct batadv_orig_node *orig_node)
{
- struct batadv_hardif_neigh_node *hardif_neigh = NULL;
+ struct batadv_hardif_neigh_node *hardif_neigh;
/* first check without locking to avoid the overhead */
hardif_neigh = batadv_hardif_neigh_get(hard_iface, neigh_addr);
if (hardif_neigh)
return hardif_neigh;
- return batadv_hardif_neigh_create(hard_iface, neigh_addr);
+ return batadv_hardif_neigh_create(hard_iface, neigh_addr, orig_node);
}
/**
@@ -630,7 +635,7 @@ batadv_neigh_node_create(struct batadv_orig_node *orig_node,
goto out;
hardif_neigh = batadv_hardif_neigh_get_or_create(hard_iface,
- neigh_addr);
+ neigh_addr, orig_node);
if (!hardif_neigh)
goto out;
@@ -683,7 +688,7 @@ batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node,
struct batadv_hard_iface *hard_iface,
const u8 *neigh_addr)
{
- struct batadv_neigh_node *neigh_node = NULL;
+ struct batadv_neigh_node *neigh_node;
/* first check without locking to avoid the overhead */
neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr);
@@ -1021,7 +1026,7 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
batadv_orig_node_vlan_put(vlan);
for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) {
- INIT_HLIST_HEAD(&orig_node->fragments[i].head);
+ INIT_HLIST_HEAD(&orig_node->fragments[i].fragment_list);
spin_lock_init(&orig_node->fragments[i].lock);
orig_node->fragments[i].size = 0;
}
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index ebc56183f358..d94220a6d21a 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 6afc0b86950e..8e8a5db197cb 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -21,7 +21,7 @@
#include <asm/byteorder.h>
#include <linux/types.h>
-#define batadv_tp_is_error(n) ((u8)n > 127 ? 1 : 0)
+#define batadv_tp_is_error(n) ((u8)(n) > 127 ? 1 : 0)
/**
* enum batadv_packettype - types for batman-adv encapsulated packets
@@ -252,16 +252,6 @@ struct batadv_elp_packet {
#define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet)
/**
- * enum batadv_icmp_user_cmd_type - types for batman-adv icmp cmd modes
- * @BATADV_TP_START: start a throughput meter run
- * @BATADV_TP_STOP: stop a throughput meter run
- */
-enum batadv_icmp_user_cmd_type {
- BATADV_TP_START = 0,
- BATADV_TP_STOP = 2,
-};
-
-/**
* struct batadv_icmp_header - common members among all the ICMP packets
* @packet_type: batman-adv packet type, part of the general header
* @version: batman-adv protocol version, part of the genereal header
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 7e8dc648b95a..7fd740b6e36d 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -196,8 +196,8 @@ bool batadv_check_management_packet(struct sk_buff *skb,
if (!is_broadcast_ether_addr(ethhdr->h_dest))
return false;
- /* packet with broadcast sender address */
- if (is_broadcast_ether_addr(ethhdr->h_source))
+ /* packet with invalid sender address */
+ if (!is_valid_ether_addr(ethhdr->h_source))
return false;
/* create a copy of the skb, if needed, to modify it. */
@@ -262,11 +262,11 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv,
icmph->ttl = BATADV_TTL;
res = batadv_send_skb_to_orig(skb, orig_node, NULL);
- if (res == -1)
- goto out;
-
- ret = NET_RX_SUCCESS;
+ if (res == NET_XMIT_SUCCESS)
+ ret = NET_RX_SUCCESS;
+ /* skb was consumed */
+ skb = NULL;
break;
case BATADV_TP:
if (!pskb_may_pull(skb, sizeof(struct batadv_icmp_tp_packet)))
@@ -274,6 +274,8 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv,
batadv_tp_meter_recv(bat_priv, skb);
ret = NET_RX_SUCCESS;
+ /* skb was consumed */
+ skb = NULL;
goto out;
default:
/* drop unknown type */
@@ -284,6 +286,9 @@ out:
batadv_hardif_put(primary_if);
if (orig_node)
batadv_orig_node_put(orig_node);
+
+ kfree_skb(skb);
+
return ret;
}
@@ -325,14 +330,20 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv,
icmp_packet->ttl = BATADV_TTL;
res = batadv_send_skb_to_orig(skb, orig_node, NULL);
- if (res != -1)
- ret = NET_RX_SUCCESS;
+ if (res == NET_RX_SUCCESS)
+ ret = NET_XMIT_SUCCESS;
+
+ /* skb was consumed */
+ skb = NULL;
out:
if (primary_if)
batadv_hardif_put(primary_if);
if (orig_node)
batadv_orig_node_put(orig_node);
+
+ kfree_skb(skb);
+
return ret;
}
@@ -349,21 +360,21 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
/* drop packet if it has not necessary minimum size */
if (unlikely(!pskb_may_pull(skb, hdr_size)))
- goto out;
+ goto free_skb;
ethhdr = eth_hdr(skb);
- /* packet with unicast indication but broadcast recipient */
- if (is_broadcast_ether_addr(ethhdr->h_dest))
- goto out;
+ /* packet with unicast indication but non-unicast recipient */
+ if (!is_valid_ether_addr(ethhdr->h_dest))
+ goto free_skb;
- /* packet with broadcast sender address */
- if (is_broadcast_ether_addr(ethhdr->h_source))
- goto out;
+ /* packet with broadcast/multicast sender address */
+ if (is_multicast_ether_addr(ethhdr->h_source))
+ goto free_skb;
/* not for me */
if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest))
- goto out;
+ goto free_skb;
icmph = (struct batadv_icmp_header *)skb->data;
@@ -372,17 +383,17 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
icmph->msg_type == BATADV_ECHO_REQUEST) &&
(skb->len >= sizeof(struct batadv_icmp_packet_rr))) {
if (skb_linearize(skb) < 0)
- goto out;
+ goto free_skb;
/* create a copy of the skb, if needed, to modify it. */
if (skb_cow(skb, ETH_HLEN) < 0)
- goto out;
+ goto free_skb;
ethhdr = eth_hdr(skb);
icmph = (struct batadv_icmp_header *)skb->data;
icmp_packet_rr = (struct batadv_icmp_packet_rr *)icmph;
if (icmp_packet_rr->rr_cur >= BATADV_RR_LEN)
- goto out;
+ goto free_skb;
ether_addr_copy(icmp_packet_rr->rr[icmp_packet_rr->rr_cur],
ethhdr->h_dest);
@@ -400,11 +411,11 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
/* get routing information */
orig_node = batadv_orig_hash_find(bat_priv, icmph->dst);
if (!orig_node)
- goto out;
+ goto free_skb;
/* create a copy of the skb, if needed, to modify it. */
if (skb_cow(skb, ETH_HLEN) < 0)
- goto out;
+ goto put_orig_node;
icmph = (struct batadv_icmp_header *)skb->data;
@@ -413,12 +424,18 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
/* route it */
res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
- if (res != -1)
+ if (res == NET_XMIT_SUCCESS)
ret = NET_RX_SUCCESS;
-out:
+ /* skb was consumed */
+ skb = NULL;
+
+put_orig_node:
if (orig_node)
batadv_orig_node_put(orig_node);
+free_skb:
+ kfree_skb(skb);
+
return ret;
}
@@ -445,12 +462,12 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv,
ethhdr = eth_hdr(skb);
- /* packet with unicast indication but broadcast recipient */
- if (is_broadcast_ether_addr(ethhdr->h_dest))
+ /* packet with unicast indication but non-unicast recipient */
+ if (!is_valid_ether_addr(ethhdr->h_dest))
return -EBADR;
- /* packet with broadcast sender address */
- if (is_broadcast_ether_addr(ethhdr->h_source))
+ /* packet with broadcast/multicast sender address */
+ if (is_multicast_ether_addr(ethhdr->h_source))
return -EBADR;
/* not for me */
@@ -667,18 +684,18 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
if (unicast_packet->ttl < 2) {
pr_debug("Warning - can't forward unicast packet from %pM to %pM: ttl exceeded\n",
ethhdr->h_source, unicast_packet->dest);
- goto out;
+ goto free_skb;
}
/* get routing information */
orig_node = batadv_orig_hash_find(bat_priv, unicast_packet->dest);
if (!orig_node)
- goto out;
+ goto free_skb;
/* create a copy of the skb, if needed, to modify it. */
if (skb_cow(skb, ETH_HLEN) < 0)
- goto out;
+ goto put_orig_node;
/* decrement ttl */
unicast_packet = (struct batadv_unicast_packet *)skb->data;
@@ -702,22 +719,24 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
len = skb->len;
res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
- if (res == -1)
- goto out;
/* translate transmit result into receive result */
if (res == NET_XMIT_SUCCESS) {
+ ret = NET_RX_SUCCESS;
/* skb was transmitted and consumed */
batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
len + ETH_HLEN);
}
- ret = NET_RX_SUCCESS;
+ /* skb was consumed */
+ skb = NULL;
+
+put_orig_node:
+ batadv_orig_node_put(orig_node);
+free_skb:
+ kfree_skb(skb);
-out:
- if (orig_node)
- batadv_orig_node_put(orig_node);
return ret;
}
@@ -902,14 +921,18 @@ int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb,
check = batadv_check_unicast_packet(bat_priv, skb, hdr_size);
if (check < 0)
- return NET_RX_DROP;
+ goto free_skb;
/* we don't know about this type, drop it. */
unicast_packet = (struct batadv_unicast_packet *)skb->data;
if (batadv_is_my_mac(bat_priv, unicast_packet->dest))
- return NET_RX_DROP;
+ goto free_skb;
return batadv_route_unicast_packet(skb, recv_if);
+
+free_skb:
+ kfree_skb(skb);
+ return NET_RX_DROP;
}
int batadv_recv_unicast_packet(struct sk_buff *skb,
@@ -923,6 +946,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
int check, hdr_size = sizeof(*unicast_packet);
enum batadv_subtype subtype;
bool is4addr;
+ int ret = NET_RX_DROP;
unicast_packet = (struct batadv_unicast_packet *)skb->data;
unicast_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data;
@@ -942,9 +966,9 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
batadv_nc_skb_store_sniffed_unicast(bat_priv, skb);
if (check < 0)
- return NET_RX_DROP;
+ goto free_skb;
if (!batadv_check_unicast_ttvn(bat_priv, skb, hdr_size))
- return NET_RX_DROP;
+ goto free_skb;
/* packet for me */
if (batadv_is_my_mac(bat_priv, unicast_packet->dest)) {
@@ -982,7 +1006,14 @@ rx_success:
return NET_RX_SUCCESS;
}
- return batadv_route_unicast_packet(skb, recv_if);
+ ret = batadv_route_unicast_packet(skb, recv_if);
+ /* skb was consumed */
+ skb = NULL;
+
+free_skb:
+ kfree_skb(skb);
+
+ return ret;
}
/**
@@ -1004,15 +1035,15 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb,
int ret = NET_RX_DROP;
if (batadv_check_unicast_packet(bat_priv, skb, hdr_size) < 0)
- return NET_RX_DROP;
+ goto free_skb;
/* the header is likely to be modified while forwarding */
if (skb_cow(skb, hdr_size) < 0)
- return NET_RX_DROP;
+ goto free_skb;
/* packet needs to be linearized to access the tvlv content */
if (skb_linearize(skb) < 0)
- return NET_RX_DROP;
+ goto free_skb;
unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)skb->data;
@@ -1020,17 +1051,21 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb,
tvlv_buff_len = ntohs(unicast_tvlv_packet->tvlv_len);
if (tvlv_buff_len > skb->len - hdr_size)
- return NET_RX_DROP;
+ goto free_skb;
ret = batadv_tvlv_containers_process(bat_priv, false, NULL,
unicast_tvlv_packet->src,
unicast_tvlv_packet->dst,
tvlv_buff, tvlv_buff_len);
- if (ret != NET_RX_SUCCESS)
+ if (ret != NET_RX_SUCCESS) {
ret = batadv_route_unicast_packet(skb, recv_if);
- else
- consume_skb(skb);
+ /* skb was consumed */
+ skb = NULL;
+ }
+
+free_skb:
+ kfree_skb(skb);
return ret;
}
@@ -1056,20 +1091,22 @@ int batadv_recv_frag_packet(struct sk_buff *skb,
if (batadv_check_unicast_packet(bat_priv, skb,
sizeof(*frag_packet)) < 0)
- goto out;
+ goto free_skb;
frag_packet = (struct batadv_frag_packet *)skb->data;
orig_node_src = batadv_orig_hash_find(bat_priv, frag_packet->orig);
if (!orig_node_src)
- goto out;
+ goto free_skb;
skb->priority = frag_packet->priority + 256;
/* Route the fragment if it is not for us and too big to be merged. */
if (!batadv_is_my_mac(bat_priv, frag_packet->dest) &&
batadv_frag_skb_fwd(skb, recv_if, orig_node_src)) {
+ /* skb was consumed */
+ skb = NULL;
ret = NET_RX_SUCCESS;
- goto out;
+ goto put_orig_node;
}
batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_RX);
@@ -1077,20 +1114,24 @@ int batadv_recv_frag_packet(struct sk_buff *skb,
/* Add fragment to buffer and merge if possible. */
if (!batadv_frag_skb_buffer(&skb, orig_node_src))
- goto out;
+ goto put_orig_node;
/* Deliver merged packet to the appropriate handler, if it was
* merged
*/
- if (skb)
+ if (skb) {
batadv_batman_skb_recv(skb, recv_if->net_dev,
&recv_if->batman_adv_ptype, NULL);
+ /* skb was consumed */
+ skb = NULL;
+ }
ret = NET_RX_SUCCESS;
-out:
- if (orig_node_src)
- batadv_orig_node_put(orig_node_src);
+put_orig_node:
+ batadv_orig_node_put(orig_node_src);
+free_skb:
+ kfree_skb(skb);
return ret;
}
@@ -1109,35 +1150,35 @@ int batadv_recv_bcast_packet(struct sk_buff *skb,
/* drop packet if it has not necessary minimum size */
if (unlikely(!pskb_may_pull(skb, hdr_size)))
- goto out;
+ goto free_skb;
ethhdr = eth_hdr(skb);
/* packet with broadcast indication but unicast recipient */
if (!is_broadcast_ether_addr(ethhdr->h_dest))
- goto out;
+ goto free_skb;
- /* packet with broadcast sender address */
- if (is_broadcast_ether_addr(ethhdr->h_source))
- goto out;
+ /* packet with broadcast/multicast sender address */
+ if (is_multicast_ether_addr(ethhdr->h_source))
+ goto free_skb;
/* ignore broadcasts sent by myself */
if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
- goto out;
+ goto free_skb;
bcast_packet = (struct batadv_bcast_packet *)skb->data;
/* ignore broadcasts originated by myself */
if (batadv_is_my_mac(bat_priv, bcast_packet->orig))
- goto out;
+ goto free_skb;
if (bcast_packet->ttl < 2)
- goto out;
+ goto free_skb;
orig_node = batadv_orig_hash_find(bat_priv, bcast_packet->orig);
if (!orig_node)
- goto out;
+ goto free_skb;
spin_lock_bh(&orig_node->bcast_seqno_lock);
@@ -1165,18 +1206,18 @@ int batadv_recv_bcast_packet(struct sk_buff *skb,
/* check whether this has been sent by another originator before */
if (batadv_bla_check_bcast_duplist(bat_priv, skb))
- goto out;
+ goto free_skb;
batadv_skb_set_priority(skb, sizeof(struct batadv_bcast_packet));
/* rebroadcast packet */
- batadv_add_bcast_packet_to_list(bat_priv, skb, 1);
+ batadv_add_bcast_packet_to_list(bat_priv, skb, 1, false);
/* don't hand the broadcast up if it is from an originator
* from the same backbone.
*/
if (batadv_bla_is_backbone_gw(skb, orig_node, hdr_size))
- goto out;
+ goto free_skb;
if (batadv_dat_snoop_incoming_arp_request(bat_priv, skb, hdr_size))
goto rx_success;
@@ -1192,6 +1233,8 @@ rx_success:
spin_unlock:
spin_unlock_bh(&orig_node->bcast_seqno_lock);
+free_skb:
+ kfree_skb(skb);
out:
if (orig_node)
batadv_orig_node_put(orig_node);
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index 05c3ff42e181..5ede16c32f15 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 8d4e1f578574..1489ec27daff 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -19,6 +19,7 @@
#include "main.h"
#include <linux/atomic.h>
+#include <linux/bug.h>
#include <linux/byteorder/generic.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
@@ -64,8 +65,11 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work);
* If neigh_node is NULL, then the packet is broadcasted using hard_iface,
* otherwise it is sent as unicast to the given neighbor.
*
- * Return: NET_TX_DROP in case of error or the result of dev_queue_xmit(skb)
- * otherwise
+ * Regardless of the return value, the skb is consumed.
+ *
+ * Return: A negative errno code is returned on a failure. A success does not
+ * guarantee the frame will be transmitted as it may be dropped due
+ * to congestion or traffic shaping.
*/
int batadv_send_skb_packet(struct sk_buff *skb,
struct batadv_hard_iface *hard_iface,
@@ -111,15 +115,9 @@ int batadv_send_skb_packet(struct sk_buff *skb,
/* dev_queue_xmit() returns a negative result on error. However on
* congestion and traffic shaping, it drops and returns NET_XMIT_DROP
* (which is > 0). This will not be treated as an error.
- *
- * a negative value cannot be returned because it could be interepreted
- * as not consumed skb by callers of batadv_send_skb_to_orig.
*/
ret = dev_queue_xmit(skb);
- if (ret < 0)
- ret = NET_XMIT_DROP;
-
- return ret;
+ return net_xmit_eval(ret);
send_skb_err:
kfree_skb(skb);
return NET_XMIT_DROP;
@@ -165,11 +163,9 @@ int batadv_send_unicast_skb(struct sk_buff *skb,
* host, NULL can be passed as recv_if and no interface alternating is
* attempted.
*
- * Return: -1 on failure (and the skb is not consumed), -EINPROGRESS if the
- * skb is buffered for later transmit or the NET_XMIT status returned by the
+ * Return: negative errno code on a failure, -EINPROGRESS if the skb is
+ * buffered for later transmit or the NET_XMIT status returned by the
* lower routine if the packet has been passed down.
- *
- * If the returning value is not -1 the skb has been consumed.
*/
int batadv_send_skb_to_orig(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
@@ -177,12 +173,14 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
{
struct batadv_priv *bat_priv = orig_node->bat_priv;
struct batadv_neigh_node *neigh_node;
- int ret = -1;
+ int ret;
/* batadv_find_router() increases neigh_nodes refcount if found. */
neigh_node = batadv_find_router(bat_priv, orig_node, recv_if);
- if (!neigh_node)
- goto out;
+ if (!neigh_node) {
+ ret = -EINVAL;
+ goto free_skb;
+ }
/* Check if the skb is too large to send in one piece and fragment
* it if needed.
@@ -191,8 +189,10 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
skb->len > neigh_node->if_incoming->net_dev->mtu) {
/* Fragment and send packet. */
ret = batadv_frag_send_packet(skb, orig_node, neigh_node);
+ /* skb was consumed */
+ skb = NULL;
- goto out;
+ goto put_neigh_node;
}
/* try to network code the packet, if it is received on an interface
@@ -204,9 +204,13 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
else
ret = batadv_send_unicast_skb(skb, neigh_node);
-out:
- if (neigh_node)
- batadv_neigh_node_put(neigh_node);
+ /* skb was consumed */
+ skb = NULL;
+
+put_neigh_node:
+ batadv_neigh_node_put(neigh_node);
+free_skb:
+ kfree_skb(skb);
return ret;
}
@@ -327,7 +331,7 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
{
struct batadv_unicast_packet *unicast_packet;
struct ethhdr *ethhdr;
- int res, ret = NET_XMIT_DROP;
+ int ret = NET_XMIT_DROP;
if (!orig_node)
goto out;
@@ -364,13 +368,12 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
if (batadv_tt_global_client_is_roaming(bat_priv, ethhdr->h_dest, vid))
unicast_packet->ttvn = unicast_packet->ttvn - 1;
- res = batadv_send_skb_to_orig(skb, orig_node, NULL);
- if (res != -1)
- ret = NET_XMIT_SUCCESS;
+ ret = batadv_send_skb_to_orig(skb, orig_node, NULL);
+ /* skb was consumed */
+ skb = NULL;
out:
- if (ret == NET_XMIT_DROP)
- kfree_skb(skb);
+ kfree_skb(skb);
return ret;
}
@@ -451,13 +454,19 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
/**
* batadv_forw_packet_free - free a forwarding packet
* @forw_packet: The packet to free
+ * @dropped: whether the packet is freed because is is dropped
*
* This frees a forwarding packet and releases any resources it might
* have claimed.
*/
-void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet)
+void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet,
+ bool dropped)
{
- kfree_skb(forw_packet->skb);
+ if (dropped)
+ kfree_skb(forw_packet->skb);
+ else
+ consume_skb(forw_packet->skb);
+
if (forw_packet->if_incoming)
batadv_hardif_put(forw_packet->if_incoming);
if (forw_packet->if_outgoing)
@@ -514,6 +523,8 @@ batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming,
if (if_outgoing)
kref_get(&if_outgoing->refcount);
+ INIT_HLIST_NODE(&forw_packet->list);
+ INIT_HLIST_NODE(&forw_packet->cleanup_list);
forw_packet->skb = NULL;
forw_packet->queue_left = queue_left;
forw_packet->if_incoming = if_incoming;
@@ -529,19 +540,191 @@ err:
return NULL;
}
+/**
+ * batadv_forw_packet_was_stolen - check whether someone stole this packet
+ * @forw_packet: the forwarding packet to check
+ *
+ * This function checks whether the given forwarding packet was claimed by
+ * someone else for free().
+ *
+ * Return: True if someone stole it, false otherwise.
+ */
+static bool
+batadv_forw_packet_was_stolen(struct batadv_forw_packet *forw_packet)
+{
+ return !hlist_unhashed(&forw_packet->cleanup_list);
+}
+
+/**
+ * batadv_forw_packet_steal - claim a forw_packet for free()
+ * @forw_packet: the forwarding packet to steal
+ * @lock: a key to the store to steal from (e.g. forw_{bat,bcast}_list_lock)
+ *
+ * This function tries to steal a specific forw_packet from global
+ * visibility for the purpose of getting it for free(). That means
+ * the caller is *not* allowed to requeue it afterwards.
+ *
+ * Return: True if stealing was successful. False if someone else stole it
+ * before us.
+ */
+bool batadv_forw_packet_steal(struct batadv_forw_packet *forw_packet,
+ spinlock_t *lock)
+{
+ /* did purging routine steal it earlier? */
+ spin_lock_bh(lock);
+ if (batadv_forw_packet_was_stolen(forw_packet)) {
+ spin_unlock_bh(lock);
+ return false;
+ }
+
+ hlist_del_init(&forw_packet->list);
+
+ /* Just to spot misuse of this function */
+ hlist_add_fake(&forw_packet->cleanup_list);
+
+ spin_unlock_bh(lock);
+ return true;
+}
+
+/**
+ * batadv_forw_packet_list_steal - claim a list of forward packets for free()
+ * @forw_list: the to be stolen forward packets
+ * @cleanup_list: a backup pointer, to be able to dispose the packet later
+ * @hard_iface: the interface to steal forward packets from
+ *
+ * This function claims responsibility to free any forw_packet queued on the
+ * given hard_iface. If hard_iface is NULL forwarding packets on all hard
+ * interfaces will be claimed.
+ *
+ * The packets are being moved from the forw_list to the cleanup_list and
+ * by that allows already running threads to notice the claiming.
+ */
static void
-_batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
- struct batadv_forw_packet *forw_packet,
- unsigned long send_time)
+batadv_forw_packet_list_steal(struct hlist_head *forw_list,
+ struct hlist_head *cleanup_list,
+ const struct batadv_hard_iface *hard_iface)
{
- /* add new packet to packet list */
- spin_lock_bh(&bat_priv->forw_bcast_list_lock);
- hlist_add_head(&forw_packet->list, &bat_priv->forw_bcast_list);
- spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
+ struct batadv_forw_packet *forw_packet;
+ struct hlist_node *safe_tmp_node;
+
+ hlist_for_each_entry_safe(forw_packet, safe_tmp_node,
+ forw_list, list) {
+ /* if purge_outstanding_packets() was called with an argument
+ * we delete only packets belonging to the given interface
+ */
+ if (hard_iface &&
+ (forw_packet->if_incoming != hard_iface) &&
+ (forw_packet->if_outgoing != hard_iface))
+ continue;
+
+ hlist_del(&forw_packet->list);
+ hlist_add_head(&forw_packet->cleanup_list, cleanup_list);
+ }
+}
+
+/**
+ * batadv_forw_packet_list_free - free a list of forward packets
+ * @head: a list of to be freed forw_packets
+ *
+ * This function cancels the scheduling of any packet in the provided list,
+ * waits for any possibly running packet forwarding thread to finish and
+ * finally, safely frees this forward packet.
+ *
+ * This function might sleep.
+ */
+static void batadv_forw_packet_list_free(struct hlist_head *head)
+{
+ struct batadv_forw_packet *forw_packet;
+ struct hlist_node *safe_tmp_node;
- /* start timer for this packet */
- queue_delayed_work(batadv_event_workqueue, &forw_packet->delayed_work,
- send_time);
+ hlist_for_each_entry_safe(forw_packet, safe_tmp_node, head,
+ cleanup_list) {
+ cancel_delayed_work_sync(&forw_packet->delayed_work);
+
+ hlist_del(&forw_packet->cleanup_list);
+ batadv_forw_packet_free(forw_packet, true);
+ }
+}
+
+/**
+ * batadv_forw_packet_queue - try to queue a forwarding packet
+ * @forw_packet: the forwarding packet to queue
+ * @lock: a key to the store (e.g. forw_{bat,bcast}_list_lock)
+ * @head: the shelve to queue it on (e.g. forw_{bat,bcast}_list)
+ * @send_time: timestamp (jiffies) when the packet is to be sent
+ *
+ * This function tries to (re)queue a forwarding packet. Requeuing
+ * is prevented if the according interface is shutting down
+ * (e.g. if batadv_forw_packet_list_steal() was called for this
+ * packet earlier).
+ *
+ * Calling batadv_forw_packet_queue() after a call to
+ * batadv_forw_packet_steal() is forbidden!
+ *
+ * Caller needs to ensure that forw_packet->delayed_work was initialized.
+ */
+static void batadv_forw_packet_queue(struct batadv_forw_packet *forw_packet,
+ spinlock_t *lock, struct hlist_head *head,
+ unsigned long send_time)
+{
+ spin_lock_bh(lock);
+
+ /* did purging routine steal it from us? */
+ if (batadv_forw_packet_was_stolen(forw_packet)) {
+ /* If you got it for free() without trouble, then
+ * don't get back into the queue after stealing...
+ */
+ WARN_ONCE(hlist_fake(&forw_packet->cleanup_list),
+ "Requeuing after batadv_forw_packet_steal() not allowed!\n");
+
+ spin_unlock_bh(lock);
+ return;
+ }
+
+ hlist_del_init(&forw_packet->list);
+ hlist_add_head(&forw_packet->list, head);
+
+ queue_delayed_work(batadv_event_workqueue,
+ &forw_packet->delayed_work,
+ send_time - jiffies);
+ spin_unlock_bh(lock);
+}
+
+/**
+ * batadv_forw_packet_bcast_queue - try to queue a broadcast packet
+ * @bat_priv: the bat priv with all the soft interface information
+ * @forw_packet: the forwarding packet to queue
+ * @send_time: timestamp (jiffies) when the packet is to be sent
+ *
+ * This function tries to (re)queue a broadcast packet.
+ *
+ * Caller needs to ensure that forw_packet->delayed_work was initialized.
+ */
+static void
+batadv_forw_packet_bcast_queue(struct batadv_priv *bat_priv,
+ struct batadv_forw_packet *forw_packet,
+ unsigned long send_time)
+{
+ batadv_forw_packet_queue(forw_packet, &bat_priv->forw_bcast_list_lock,
+ &bat_priv->forw_bcast_list, send_time);
+}
+
+/**
+ * batadv_forw_packet_ogmv1_queue - try to queue an OGMv1 packet
+ * @bat_priv: the bat priv with all the soft interface information
+ * @forw_packet: the forwarding packet to queue
+ * @send_time: timestamp (jiffies) when the packet is to be sent
+ *
+ * This function tries to (re)queue an OGMv1 packet.
+ *
+ * Caller needs to ensure that forw_packet->delayed_work was initialized.
+ */
+void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv,
+ struct batadv_forw_packet *forw_packet,
+ unsigned long send_time)
+{
+ batadv_forw_packet_queue(forw_packet, &bat_priv->forw_bat_list_lock,
+ &bat_priv->forw_bat_list, send_time);
}
/**
@@ -549,6 +732,7 @@ _batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
* @bat_priv: the bat priv with all the soft interface information
* @skb: broadcast packet to add
* @delay: number of jiffies to wait before sending
+ * @own_packet: true if it is a self-generated broadcast packet
*
* add a broadcast packet to the queue and setup timers. broadcast packets
* are sent multiple times to increase probability for being received.
@@ -560,9 +744,10 @@ _batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
*/
int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
const struct sk_buff *skb,
- unsigned long delay)
+ unsigned long delay,
+ bool own_packet)
{
- struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_hard_iface *primary_if;
struct batadv_forw_packet *forw_packet;
struct batadv_bcast_packet *bcast_packet;
struct sk_buff *newskb;
@@ -586,18 +771,17 @@ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
bcast_packet = (struct batadv_bcast_packet *)newskb->data;
bcast_packet->ttl--;
- skb_reset_mac_header(newskb);
-
forw_packet->skb = newskb;
+ forw_packet->own = own_packet;
INIT_DELAYED_WORK(&forw_packet->delayed_work,
batadv_send_outstanding_bcast_packet);
- _batadv_add_bcast_packet_to_list(bat_priv, forw_packet, delay);
+ batadv_forw_packet_bcast_queue(bat_priv, forw_packet, jiffies + delay);
return NETDEV_TX_OK;
err_packet_free:
- batadv_forw_packet_free(forw_packet);
+ batadv_forw_packet_free(forw_packet, true);
err:
return NETDEV_TX_BUSY;
}
@@ -605,11 +789,18 @@ err:
static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
{
struct batadv_hard_iface *hard_iface;
+ struct batadv_hardif_neigh_node *neigh_node;
struct delayed_work *delayed_work;
struct batadv_forw_packet *forw_packet;
+ struct batadv_bcast_packet *bcast_packet;
struct sk_buff *skb1;
struct net_device *soft_iface;
struct batadv_priv *bat_priv;
+ unsigned long send_time = jiffies + msecs_to_jiffies(5);
+ bool dropped = false;
+ u8 *neigh_addr;
+ u8 *orig_neigh;
+ int ret = 0;
delayed_work = to_delayed_work(work);
forw_packet = container_of(delayed_work, struct batadv_forw_packet,
@@ -617,15 +808,17 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
soft_iface = forw_packet->if_incoming->soft_iface;
bat_priv = netdev_priv(soft_iface);
- spin_lock_bh(&bat_priv->forw_bcast_list_lock);
- hlist_del(&forw_packet->list);
- spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
-
- if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) {
+ dropped = true;
goto out;
+ }
- if (batadv_dat_drop_broadcast_packet(bat_priv, forw_packet))
+ if (batadv_dat_drop_broadcast_packet(bat_priv, forw_packet)) {
+ dropped = true;
goto out;
+ }
+
+ bcast_packet = (struct batadv_bcast_packet *)forw_packet->skb->data;
/* rebroadcast packet */
rcu_read_lock();
@@ -636,6 +829,49 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
if (forw_packet->num_packets >= hard_iface->num_bcasts)
continue;
+ if (forw_packet->own) {
+ neigh_node = NULL;
+ } else {
+ neigh_addr = eth_hdr(forw_packet->skb)->h_source;
+ neigh_node = batadv_hardif_neigh_get(hard_iface,
+ neigh_addr);
+ }
+
+ orig_neigh = neigh_node ? neigh_node->orig : NULL;
+
+ ret = batadv_hardif_no_broadcast(hard_iface, bcast_packet->orig,
+ orig_neigh);
+
+ if (ret) {
+ char *type;
+
+ switch (ret) {
+ case BATADV_HARDIF_BCAST_NORECIPIENT:
+ type = "no neighbor";
+ break;
+ case BATADV_HARDIF_BCAST_DUPFWD:
+ type = "single neighbor is source";
+ break;
+ case BATADV_HARDIF_BCAST_DUPORIG:
+ type = "single neighbor is originator";
+ break;
+ default:
+ type = "unknown";
+ }
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "BCAST packet from orig %pM on %s surpressed: %s\n",
+ bcast_packet->orig,
+ hard_iface->net_dev->name, type);
+
+ if (neigh_node)
+ batadv_hardif_neigh_put(neigh_node);
+
+ continue;
+ }
+
+ if (neigh_node)
+ batadv_hardif_neigh_put(neigh_node);
+
if (!kref_get_unless_zero(&hard_iface->refcount))
continue;
@@ -652,22 +888,34 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
/* if we still have some more bcasts to send */
if (forw_packet->num_packets < BATADV_NUM_BCASTS_MAX) {
- _batadv_add_bcast_packet_to_list(bat_priv, forw_packet,
- msecs_to_jiffies(5));
+ batadv_forw_packet_bcast_queue(bat_priv, forw_packet,
+ send_time);
return;
}
out:
- batadv_forw_packet_free(forw_packet);
+ /* do we get something for free()? */
+ if (batadv_forw_packet_steal(forw_packet,
+ &bat_priv->forw_bcast_list_lock))
+ batadv_forw_packet_free(forw_packet, dropped);
}
+/**
+ * batadv_purge_outstanding_packets - stop/purge scheduled bcast/OGMv1 packets
+ * @bat_priv: the bat priv with all the soft interface information
+ * @hard_iface: the hard interface to cancel and purge bcast/ogm packets on
+ *
+ * This method cancels and purges any broadcast and OGMv1 packet on the given
+ * hard_iface. If hard_iface is NULL, broadcast and OGMv1 packets on all hard
+ * interfaces will be canceled and purged.
+ *
+ * This function might sleep.
+ */
void
batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
const struct batadv_hard_iface *hard_iface)
{
- struct batadv_forw_packet *forw_packet;
- struct hlist_node *safe_tmp_node;
- bool pending;
+ struct hlist_head head = HLIST_HEAD_INIT;
if (hard_iface)
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
@@ -677,57 +925,18 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"purge_outstanding_packets()\n");
- /* free bcast list */
+ /* claim bcast list for free() */
spin_lock_bh(&bat_priv->forw_bcast_list_lock);
- hlist_for_each_entry_safe(forw_packet, safe_tmp_node,
- &bat_priv->forw_bcast_list, list) {
- /* if purge_outstanding_packets() was called with an argument
- * we delete only packets belonging to the given interface
- */
- if ((hard_iface) &&
- (forw_packet->if_incoming != hard_iface) &&
- (forw_packet->if_outgoing != hard_iface))
- continue;
-
- spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
-
- /* batadv_send_outstanding_bcast_packet() will lock the list to
- * delete the item from the list
- */
- pending = cancel_delayed_work_sync(&forw_packet->delayed_work);
- spin_lock_bh(&bat_priv->forw_bcast_list_lock);
-
- if (pending) {
- hlist_del(&forw_packet->list);
- batadv_forw_packet_free(forw_packet);
- }
- }
+ batadv_forw_packet_list_steal(&bat_priv->forw_bcast_list, &head,
+ hard_iface);
spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
- /* free batman packet list */
+ /* claim batman packet list for free() */
spin_lock_bh(&bat_priv->forw_bat_list_lock);
- hlist_for_each_entry_safe(forw_packet, safe_tmp_node,
- &bat_priv->forw_bat_list, list) {
- /* if purge_outstanding_packets() was called with an argument
- * we delete only packets belonging to the given interface
- */
- if ((hard_iface) &&
- (forw_packet->if_incoming != hard_iface) &&
- (forw_packet->if_outgoing != hard_iface))
- continue;
-
- spin_unlock_bh(&bat_priv->forw_bat_list_lock);
-
- /* send_outstanding_bat_packet() will lock the list to
- * delete the item from the list
- */
- pending = cancel_delayed_work_sync(&forw_packet->delayed_work);
- spin_lock_bh(&bat_priv->forw_bat_list_lock);
-
- if (pending) {
- hlist_del(&forw_packet->list);
- batadv_forw_packet_free(forw_packet);
- }
- }
+ batadv_forw_packet_list_steal(&bat_priv->forw_bat_list, &head,
+ hard_iface);
spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+
+ /* then cancel or wait for packet workers to finish and free */
+ batadv_forw_packet_list_free(&head);
}
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index 999f78683d9e..f21166d10323 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -21,18 +21,24 @@
#include "main.h"
#include <linux/compiler.h>
+#include <linux/spinlock.h>
#include <linux/types.h>
#include "packet.h"
struct sk_buff;
-void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet);
+void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet,
+ bool dropped);
struct batadv_forw_packet *
batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming,
struct batadv_hard_iface *if_outgoing,
atomic_t *queue_left,
struct batadv_priv *bat_priv);
+bool batadv_forw_packet_steal(struct batadv_forw_packet *packet, spinlock_t *l);
+void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv,
+ struct batadv_forw_packet *forw_packet,
+ unsigned long send_time);
int batadv_send_skb_to_orig(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
@@ -46,7 +52,8 @@ int batadv_send_unicast_skb(struct sk_buff *skb,
struct batadv_neigh_node *neigh_node);
int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
const struct sk_buff *skb,
- unsigned long delay);
+ unsigned long delay,
+ bool own_packet);
void
batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
const struct batadv_hard_iface *hard_iface);
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 49e16b6e0ba3..d042c99af028 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -22,6 +22,7 @@
#include <linux/byteorder/generic.h>
#include <linux/cache.h>
#include <linux/compiler.h>
+#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
@@ -116,6 +117,26 @@ static int batadv_interface_release(struct net_device *dev)
return 0;
}
+/**
+ * batadv_sum_counter - Sum the cpu-local counters for index 'idx'
+ * @bat_priv: the bat priv with all the soft interface information
+ * @idx: index of counter to sum up
+ *
+ * Return: sum of all cpu-local counters
+ */
+static u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx)
+{
+ u64 *counters, sum = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ counters = per_cpu_ptr(bat_priv->bat_counters, cpu);
+ sum += counters[idx];
+ }
+
+ return sum;
+}
+
static struct net_device_stats *batadv_interface_stats(struct net_device *dev)
{
struct batadv_priv *bat_priv = netdev_priv(dev);
@@ -237,7 +258,8 @@ static int batadv_interface_tx(struct sk_buff *skb,
ethhdr = eth_hdr(skb);
/* Register the client MAC in the transtable */
- if (!is_multicast_ether_addr(ethhdr->h_source)) {
+ if (!is_multicast_ether_addr(ethhdr->h_source) &&
+ !batadv_bla_is_loopdetect_mac(ethhdr->h_source)) {
client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source,
vid, skb->skb_iif,
skb->mark);
@@ -336,12 +358,12 @@ send:
seqno = atomic_inc_return(&bat_priv->bcast_seqno);
bcast_packet->seqno = htonl(seqno);
- batadv_add_bcast_packet_to_list(bat_priv, skb, brd_delay);
+ batadv_add_bcast_packet_to_list(bat_priv, skb, brd_delay, true);
/* a copy is stored in the bcast list, therefore removing
* the original skb.
*/
- kfree_skb(skb);
+ consume_skb(skb);
/* unicast packet */
} else {
@@ -365,7 +387,7 @@ send:
ret = batadv_send_skb_via_tt(bat_priv, skb, dst_hint,
vid);
}
- if (ret == NET_XMIT_DROP)
+ if (ret != NET_XMIT_SUCCESS)
goto dropped_freed;
}
@@ -460,8 +482,6 @@ void batadv_interface_rx(struct net_device *soft_iface,
batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
skb->len + ETH_HLEN);
- soft_iface->last_rx = jiffies;
-
/* Let the bridge loop avoidance check the packet. If will
* not handle it, we can safely push it up.
*/
@@ -799,7 +819,6 @@ static int batadv_softif_init_late(struct net_device *dev)
atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0);
#endif
atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF);
- atomic_set(&bat_priv->gw.sel_class, 20);
atomic_set(&bat_priv->gw.bandwidth_down, 100);
atomic_set(&bat_priv->gw.bandwidth_up, 20);
atomic_set(&bat_priv->orig_interval, 1000);
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index ec303ddbf647..639c3abb214a 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 02d96f224c60..0ae8b30e4eaa 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -33,7 +33,6 @@
#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
-#include <linux/stat.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/stringify.h>
@@ -666,41 +665,36 @@ static ssize_t batadv_store_isolation_mark(struct kobject *kobj,
return count;
}
-BATADV_ATTR_SIF_BOOL(aggregated_ogms, S_IRUGO | S_IWUSR, NULL);
-BATADV_ATTR_SIF_BOOL(bonding, S_IRUGO | S_IWUSR, NULL);
+BATADV_ATTR_SIF_BOOL(aggregated_ogms, 0644, NULL);
+BATADV_ATTR_SIF_BOOL(bonding, 0644, NULL);
#ifdef CONFIG_BATMAN_ADV_BLA
-BATADV_ATTR_SIF_BOOL(bridge_loop_avoidance, S_IRUGO | S_IWUSR,
- batadv_bla_status_update);
+BATADV_ATTR_SIF_BOOL(bridge_loop_avoidance, 0644, batadv_bla_status_update);
#endif
#ifdef CONFIG_BATMAN_ADV_DAT
-BATADV_ATTR_SIF_BOOL(distributed_arp_table, S_IRUGO | S_IWUSR,
- batadv_dat_status_update);
+BATADV_ATTR_SIF_BOOL(distributed_arp_table, 0644, batadv_dat_status_update);
#endif
-BATADV_ATTR_SIF_BOOL(fragmentation, S_IRUGO | S_IWUSR, batadv_update_min_mtu);
-static BATADV_ATTR(routing_algo, S_IRUGO, batadv_show_bat_algo, NULL);
-static BATADV_ATTR(gw_mode, S_IRUGO | S_IWUSR, batadv_show_gw_mode,
- batadv_store_gw_mode);
-BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, S_IRUGO | S_IWUSR,
- 2 * BATADV_JITTER, INT_MAX, NULL);
-BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, S_IRUGO | S_IWUSR, 0,
- BATADV_TQ_MAX_VALUE, NULL);
-static BATADV_ATTR(gw_sel_class, S_IRUGO | S_IWUSR, batadv_show_gw_sel_class,
+BATADV_ATTR_SIF_BOOL(fragmentation, 0644, batadv_update_min_mtu);
+static BATADV_ATTR(routing_algo, 0444, batadv_show_bat_algo, NULL);
+static BATADV_ATTR(gw_mode, 0644, batadv_show_gw_mode, batadv_store_gw_mode);
+BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, 0644, 2 * BATADV_JITTER,
+ INT_MAX, NULL);
+BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, 0644, 0, BATADV_TQ_MAX_VALUE,
+ NULL);
+static BATADV_ATTR(gw_sel_class, 0644, batadv_show_gw_sel_class,
batadv_store_gw_sel_class);
-static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth,
+static BATADV_ATTR(gw_bandwidth, 0644, batadv_show_gw_bwidth,
batadv_store_gw_bwidth);
#ifdef CONFIG_BATMAN_ADV_MCAST
-BATADV_ATTR_SIF_BOOL(multicast_mode, S_IRUGO | S_IWUSR, NULL);
+BATADV_ATTR_SIF_BOOL(multicast_mode, 0644, NULL);
#endif
#ifdef CONFIG_BATMAN_ADV_DEBUG
-BATADV_ATTR_SIF_UINT(log_level, log_level, S_IRUGO | S_IWUSR, 0,
- BATADV_DBG_ALL, NULL);
+BATADV_ATTR_SIF_UINT(log_level, log_level, 0644, 0, BATADV_DBG_ALL, NULL);
#endif
#ifdef CONFIG_BATMAN_ADV_NC
-BATADV_ATTR_SIF_BOOL(network_coding, S_IRUGO | S_IWUSR,
- batadv_nc_status_update);
+BATADV_ATTR_SIF_BOOL(network_coding, 0644, batadv_nc_status_update);
#endif
-static BATADV_ATTR(isolation_mark, S_IRUGO | S_IWUSR,
- batadv_show_isolation_mark, batadv_store_isolation_mark);
+static BATADV_ATTR(isolation_mark, 0644, batadv_show_isolation_mark,
+ batadv_store_isolation_mark);
static struct batadv_attribute *batadv_mesh_attrs[] = {
&batadv_attr_aggregated_ogms,
@@ -731,7 +725,7 @@ static struct batadv_attribute *batadv_mesh_attrs[] = {
NULL,
};
-BATADV_ATTR_VLAN_BOOL(ap_isolation, S_IRUGO | S_IWUSR, NULL);
+BATADV_ATTR_VLAN_BOOL(ap_isolation, 0644, NULL);
/* array of vlan specific sysfs attributes */
static struct batadv_attribute *batadv_vlan_attrs[] = {
@@ -1116,14 +1110,13 @@ static ssize_t batadv_show_throughput_override(struct kobject *kobj,
#endif
-static BATADV_ATTR(mesh_iface, S_IRUGO | S_IWUSR, batadv_show_mesh_iface,
+static BATADV_ATTR(mesh_iface, 0644, batadv_show_mesh_iface,
batadv_store_mesh_iface);
-static BATADV_ATTR(iface_status, S_IRUGO, batadv_show_iface_status, NULL);
+static BATADV_ATTR(iface_status, 0444, batadv_show_iface_status, NULL);
#ifdef CONFIG_BATMAN_ADV_BATMAN_V
-BATADV_ATTR_HIF_UINT(elp_interval, bat_v.elp_interval, S_IRUGO | S_IWUSR,
+BATADV_ATTR_HIF_UINT(elp_interval, bat_v.elp_interval, 0644,
2 * BATADV_JITTER, INT_MAX, NULL);
-static BATADV_ATTR(throughput_override, S_IRUGO | S_IWUSR,
- batadv_show_throughput_override,
+static BATADV_ATTR(throughput_override, 0644, batadv_show_throughput_override,
batadv_store_throughput_override);
#endif
diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h
index c76021b4e198..e487412e256b 100644
--- a/net/batman-adv/sysfs.h
+++ b/net/batman-adv/sysfs.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 8af1611b8ab2..c94ebdecdc3d 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors:
*
* Edo Monticelli, Antonio Quartulli
*
@@ -23,7 +23,7 @@
#include <linux/byteorder/generic.h>
#include <linux/cache.h>
#include <linux/compiler.h>
-#include <linux/device.h>
+#include <linux/err.h>
#include <linux/etherdevice.h>
#include <linux/fs.h>
#include <linux/if_ether.h>
@@ -615,9 +615,6 @@ static int batadv_tp_send_msg(struct batadv_tp_vars *tp_vars, const u8 *src,
batadv_tp_fill_prerandom(tp_vars, data, data_len);
r = batadv_send_skb_to_orig(skb, orig_node, NULL);
- if (r == -1)
- kfree_skb(skb);
-
if (r == NET_XMIT_SUCCESS)
return 0;
@@ -1207,9 +1204,6 @@ static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst,
/* send the ack */
r = batadv_send_skb_to_orig(skb, orig_node, NULL);
- if (r == -1)
- kfree_skb(skb);
-
if (unlikely(r < 0) || (r == NET_XMIT_DROP)) {
ret = BATADV_TP_REASON_DST_UNREACHABLE;
goto out;
diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h
index ba922c425e56..a8ada5c123bd 100644
--- a/net/batman-adv/tp_meter.h
+++ b/net/batman-adv/tp_meter.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors:
*
* Edo Monticelli, Antonio Quartulli
*
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 0dc85eb1cb7a..6077a87d46f0 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich, Antonio Quartulli
*
@@ -56,7 +56,6 @@
#include "hard-interface.h"
#include "hash.h"
#include "log.h"
-#include "multicast.h"
#include "netlink.h"
#include "originator.h"
#include "packet.h"
@@ -647,6 +646,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
struct net *net = dev_net(soft_iface);
struct batadv_softif_vlan *vlan;
struct net_device *in_dev = NULL;
+ struct batadv_hard_iface *in_hardif = NULL;
struct hlist_head *head;
struct batadv_tt_orig_list_entry *orig_entry;
int hash_added, table_size, packet_size_max;
@@ -658,6 +658,9 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
if (ifindex != BATADV_NULL_IFINDEX)
in_dev = dev_get_by_index(net, ifindex);
+ if (in_dev)
+ in_hardif = batadv_hardif_get_by_netdev(in_dev);
+
tt_local = batadv_tt_local_hash_find(bat_priv, addr, vid);
if (!is_multicast_ether_addr(addr))
@@ -731,7 +734,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
*/
tt_local->common.flags = BATADV_TT_CLIENT_NEW;
tt_local->common.vid = vid;
- if (batadv_is_wifi_netdev(in_dev))
+ if (batadv_is_wifi_hardif(in_hardif))
tt_local->common.flags |= BATADV_TT_CLIENT_WIFI;
kref_init(&tt_local->common.refcount);
tt_local->last_seen = jiffies;
@@ -791,7 +794,7 @@ check_roaming:
*/
remote_flags = tt_local->common.flags & BATADV_TT_REMOTE_MASK;
- if (batadv_is_wifi_netdev(in_dev))
+ if (batadv_is_wifi_hardif(in_hardif))
tt_local->common.flags |= BATADV_TT_CLIENT_WIFI;
else
tt_local->common.flags &= ~BATADV_TT_CLIENT_WIFI;
@@ -815,6 +818,8 @@ check_roaming:
ret = true;
out:
+ if (in_hardif)
+ batadv_hardif_put(in_hardif);
if (in_dev)
dev_put(in_dev);
if (tt_local)
@@ -3709,7 +3714,6 @@ static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags,
{
struct batadv_hashtable *hash = bat_priv->tt.local_hash;
struct batadv_tt_common_entry *tt_common_entry;
- u16 changed_num = 0;
struct hlist_head *head;
u32 i;
@@ -3731,7 +3735,6 @@ static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags,
continue;
tt_common_entry->flags &= ~flags;
}
- changed_num++;
if (!count)
continue;
@@ -3795,9 +3798,6 @@ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv)
{
lockdep_assert_held(&bat_priv->tt.commit_lock);
- /* Update multicast addresses in local translation table */
- batadv_mcast_mla_update(bat_priv);
-
if (atomic_read(&bat_priv->tt.local_changes) < 1) {
if (!batadv_atomic_dec_not_zero(&bat_priv->tt.ogm_append_cnt))
batadv_tt_tvlv_container_update(bat_priv);
@@ -3835,8 +3835,8 @@ void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv)
bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst,
unsigned short vid)
{
- struct batadv_tt_local_entry *tt_local_entry = NULL;
- struct batadv_tt_global_entry *tt_global_entry = NULL;
+ struct batadv_tt_local_entry *tt_local_entry;
+ struct batadv_tt_global_entry *tt_global_entry;
struct batadv_softif_vlan *vlan;
bool ret = false;
@@ -3845,27 +3845,24 @@ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst,
return false;
if (!atomic_read(&vlan->ap_isolation))
- goto out;
+ goto vlan_put;
tt_local_entry = batadv_tt_local_hash_find(bat_priv, dst, vid);
if (!tt_local_entry)
- goto out;
+ goto vlan_put;
tt_global_entry = batadv_tt_global_hash_find(bat_priv, src, vid);
if (!tt_global_entry)
- goto out;
+ goto local_entry_put;
- if (!_batadv_is_ap_isolated(tt_local_entry, tt_global_entry))
- goto out;
-
- ret = true;
+ if (_batadv_is_ap_isolated(tt_local_entry, tt_global_entry))
+ ret = true;
-out:
+ batadv_tt_global_entry_put(tt_global_entry);
+local_entry_put:
+ batadv_tt_local_entry_put(tt_local_entry);
+vlan_put:
batadv_softif_vlan_put(vlan);
- if (tt_global_entry)
- batadv_tt_global_entry_put(tt_global_entry);
- if (tt_local_entry)
- batadv_tt_local_entry_put(tt_local_entry);
return ret;
}
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index 783fdba84db2..411d586191da 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich, Antonio Quartulli
*
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index 77654f055f24..1d9e267caec9 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -600,7 +600,6 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src,
unsigned char *tvlv_buff;
unsigned int tvlv_len;
ssize_t hdr_len = sizeof(*unicast_tvlv_packet);
- int res;
orig_node = batadv_orig_hash_find(bat_priv, dst);
if (!orig_node)
@@ -633,9 +632,7 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src,
tvlv_buff += sizeof(*tvlv_hdr);
memcpy(tvlv_buff, tvlv_value, tvlv_value_len);
- res = batadv_send_skb_to_orig(skb, orig_node, NULL);
- if (res == -1)
- kfree_skb(skb);
+ batadv_send_skb_to_orig(skb, orig_node, NULL);
out:
batadv_orig_node_put(orig_node);
}
diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h
index e4369b547b43..4d01400ada30 100644
--- a/net/batman-adv/tvlv.h
+++ b/net/batman-adv/tvlv.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index b3dd1a381aad..246f21b4973b 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -119,12 +119,28 @@ struct batadv_hard_iface_bat_v {
};
/**
+ * enum batadv_hard_iface_wifi_flags - Flags describing the wifi configuration
+ * of a batadv_hard_iface
+ * @BATADV_HARDIF_WIFI_WEXT_DIRECT: it is a wext wifi device
+ * @BATADV_HARDIF_WIFI_CFG80211_DIRECT: it is a cfg80211 wifi device
+ * @BATADV_HARDIF_WIFI_WEXT_INDIRECT: link device is a wext wifi device
+ * @BATADV_HARDIF_WIFI_CFG80211_INDIRECT: link device is a cfg80211 wifi device
+ */
+enum batadv_hard_iface_wifi_flags {
+ BATADV_HARDIF_WIFI_WEXT_DIRECT = BIT(0),
+ BATADV_HARDIF_WIFI_CFG80211_DIRECT = BIT(1),
+ BATADV_HARDIF_WIFI_WEXT_INDIRECT = BIT(2),
+ BATADV_HARDIF_WIFI_CFG80211_INDIRECT = BIT(3),
+};
+
+/**
* struct batadv_hard_iface - network device known to batman-adv
* @list: list node for batadv_hardif_list
* @if_num: identificator of the interface
* @if_status: status of the interface for batman-adv
- * @net_dev: pointer to the net_device
* @num_bcasts: number of payload re-broadcasts on this interface (ARQ)
+ * @wifi_flags: flags whether this is (directly or indirectly) a wifi interface
+ * @net_dev: pointer to the net_device
* @hardif_obj: kobject of the per interface sysfs "mesh" directory
* @refcount: number of contexts the object is used
* @batman_adv_ptype: packet type describing packets that should be processed by
@@ -141,8 +157,9 @@ struct batadv_hard_iface {
struct list_head list;
s16 if_num;
char if_status;
- struct net_device *net_dev;
u8 num_bcasts;
+ u32 wifi_flags;
+ struct net_device *net_dev;
struct kobject *hardif_obj;
struct kref refcount;
struct packet_type batman_adv_ptype;
@@ -184,7 +201,7 @@ struct batadv_orig_ifinfo {
/**
* struct batadv_frag_table_entry - head in the fragment buffer table
- * @head: head of list with fragments
+ * @fragment_list: head of list with fragments
* @lock: lock to protect the list of fragments
* @timestamp: time (jiffie) of last received fragment
* @seqno: sequence number of the fragments in the list
@@ -192,8 +209,8 @@ struct batadv_orig_ifinfo {
* @total_size: expected size of the assembled packet
*/
struct batadv_frag_table_entry {
- struct hlist_head head;
- spinlock_t lock; /* protects head */
+ struct hlist_head fragment_list;
+ spinlock_t lock; /* protects fragment_list */
unsigned long timestamp;
u16 seqno;
u16 size;
@@ -385,7 +402,7 @@ struct batadv_gw_node {
struct rcu_head rcu;
};
-DECLARE_EWMA(throughput, 1024, 8)
+DECLARE_EWMA(throughput, 10, 8)
/**
* struct batadv_hardif_neigh_node_bat_v - B.A.T.M.A.N. V private neighbor
@@ -408,6 +425,7 @@ struct batadv_hardif_neigh_node_bat_v {
* struct batadv_hardif_neigh_node - unique neighbor per hard-interface
* @list: list node for batadv_hard_iface::neigh_list
* @addr: the MAC address of the neighboring interface
+ * @orig: the address of the originator this neighbor node belongs to
* @if_incoming: pointer to incoming hard-interface
* @last_seen: when last packet via this neighbor was received
* @bat_v: B.A.T.M.A.N. V private data
@@ -417,6 +435,7 @@ struct batadv_hardif_neigh_node_bat_v {
struct batadv_hardif_neigh_node {
struct hlist_node list;
u8 addr[ETH_ALEN];
+ u8 orig[ETH_ALEN];
struct batadv_hard_iface *if_incoming;
unsigned long last_seen;
#ifdef CONFIG_BATMAN_ADV_BATMAN_V
@@ -706,8 +725,8 @@ struct batadv_priv_debug_log {
/**
* struct batadv_priv_gw - per mesh interface gateway data
- * @list: list of available gateway nodes
- * @list_lock: lock protecting gw_list & curr_gw
+ * @gateway_list: list of available gateway nodes
+ * @list_lock: lock protecting gateway_list & curr_gw
* @curr_gw: pointer to currently selected gateway node
* @mode: gateway operation: off, client or server (see batadv_gw_modes)
* @sel_class: gateway selection class (applies if gw_mode client)
@@ -716,8 +735,8 @@ struct batadv_priv_debug_log {
* @reselect: bool indicating a gateway re-selection is in progress
*/
struct batadv_priv_gw {
- struct hlist_head list;
- spinlock_t list_lock; /* protects gw_list & curr_gw */
+ struct hlist_head gateway_list;
+ spinlock_t list_lock; /* protects gateway_list & curr_gw */
struct batadv_gw_node __rcu *curr_gw; /* rcu protected pointer */
atomic_t mode;
atomic_t sel_class;
@@ -785,9 +804,10 @@ struct batadv_mcast_querier_state {
* @num_want_all_ipv6: counter for items in want_all_ipv6_list
* @want_lists_lock: lock for protecting modifications to mcast want lists
* (traversals are rcu-locked)
+ * @work: work queue callback item for multicast TT and TVLV updates
*/
struct batadv_priv_mcast {
- struct hlist_head mla_list;
+ struct hlist_head mla_list; /* see __batadv_mcast_mla_update() */
struct hlist_head want_all_unsnoopables_list;
struct hlist_head want_all_ipv4_list;
struct hlist_head want_all_ipv6_list;
@@ -802,6 +822,7 @@ struct batadv_priv_mcast {
atomic_t num_want_all_ipv6;
/* protects want_all_{unsnoopables,ipv4,ipv6}_list */
spinlock_t want_lists_lock;
+ struct delayed_work work;
};
#endif
@@ -1363,7 +1384,8 @@ struct batadv_skb_cb {
/**
* struct batadv_forw_packet - structure for bcast packets to be sent/forwarded
- * @list: list node for batadv_socket_client::queue_list
+ * @list: list node for batadv_priv::forw_{bat,bcast}_list
+ * @cleanup_list: list node for purging functions
* @send_time: execution time for delayed_work (packet sending)
* @own: bool for locally generated packets (local OGMs are re-scheduled after
* sending)
@@ -1380,6 +1402,7 @@ struct batadv_skb_cb {
*/
struct batadv_forw_packet {
struct hlist_node list;
+ struct hlist_node cleanup_list;
unsigned long send_time;
u8 own;
struct sk_buff *skb;
@@ -1466,6 +1489,7 @@ struct batadv_algo_orig_ops {
/**
* struct batadv_algo_gw_ops - mesh algorithm callbacks (GW specific)
+ * @init_sel_class: initialize GW selection class (optional)
* @store_sel_class: parse and stores a new GW selection class (optional)
* @show_sel_class: prints the current GW selection class (optional)
* @get_best_gw_node: select the best GW from the list of available nodes
@@ -1476,6 +1500,7 @@ struct batadv_algo_orig_ops {
* @dump: dump gateways to a netlink socket (optional)
*/
struct batadv_algo_gw_ops {
+ void (*init_sel_class)(struct batadv_priv *bat_priv);
ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff,
size_t count);
ssize_t (*show_sel_class)(struct batadv_priv *bat_priv, char *buff);
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 1904a93f47d5..d491529332f4 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -920,7 +920,7 @@ static void chan_close_cb(struct l2cap_chan *chan)
BT_DBG("dev %p removing %speer %p", dev,
last ? "last " : "1 ", peer);
BT_DBG("chan %p orig refcnt %d", chan,
- atomic_read(&chan->kref.refcount));
+ kref_read(&chan->kref));
l2cap_chan_put(chan);
break;
diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile
index b3ff12eb9b6d..4bfaa19a5573 100644
--- a/net/bluetooth/Makefile
+++ b/net/bluetooth/Makefile
@@ -20,5 +20,3 @@ bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o
bluetooth-$(CONFIG_BT_LEDS) += leds.o
bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o
bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o
-
-subdir-ccflags-y += -D__CHECK_ENDIAN__
diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c
index 5f123c3320a7..f0095fd79818 100644
--- a/net/bluetooth/a2mp.c
+++ b/net/bluetooth/a2mp.c
@@ -810,7 +810,7 @@ static struct l2cap_chan *a2mp_chan_open(struct l2cap_conn *conn, bool locked)
/* AMP Manager functions */
struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr)
{
- BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount));
+ BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
kref_get(&mgr->kref);
@@ -833,7 +833,7 @@ static void amp_mgr_destroy(struct kref *kref)
int amp_mgr_put(struct amp_mgr *mgr)
{
- BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount));
+ BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
return kref_put(&mgr->kref, &amp_mgr_destroy);
}
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 1aff2da9bc74..69e1f7d362a8 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -27,6 +27,8 @@
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/stringify.h>
+#include <linux/sched/signal.h>
+
#include <asm/ioctls.h>
#include <net/bluetooth/bluetooth.h>
@@ -245,7 +247,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (err == 0) {
sock_recv_ts_and_drops(msg, sk, skb);
- if (bt_sk(sk)->skb_msg_name)
+ if (msg->msg_name && bt_sk(sk)->skb_msg_name)
bt_sk(sk)->skb_msg_name(skb, msg->msg_name,
&msg->msg_namelen);
}
diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c
index e32f34189007..02a4ccc04e1e 100644
--- a/net/bluetooth/amp.c
+++ b/net/bluetooth/amp.c
@@ -24,7 +24,7 @@
void amp_ctrl_get(struct amp_ctrl *ctrl)
{
BT_DBG("ctrl %p orig refcnt %d", ctrl,
- atomic_read(&ctrl->kref.refcount));
+ kref_read(&ctrl->kref));
kref_get(&ctrl->kref);
}
@@ -42,7 +42,7 @@ static void amp_ctrl_destroy(struct kref *kref)
int amp_ctrl_put(struct amp_ctrl *ctrl)
{
BT_DBG("ctrl %p orig refcnt %d", ctrl,
- atomic_read(&ctrl->kref.refcount));
+ kref_read(&ctrl->kref));
return kref_put(&ctrl->kref, &amp_ctrl_destroy);
}
diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c
index f4fcb4a9d5c1..2b875edf77e1 100644
--- a/net/bluetooth/bnep/netdev.c
+++ b/net/bluetooth/bnep/netdev.c
@@ -211,7 +211,6 @@ static const struct net_device_ops bnep_netdev_ops = {
.ndo_set_rx_mode = bnep_net_set_mc_list,
.ndo_set_mac_address = bnep_net_set_mac_addr,
.ndo_tx_timeout = bnep_net_timeout,
- .ndo_change_mtu = eth_change_mtu,
};
@@ -222,6 +221,8 @@ void bnep_net_setup(struct net_device *dev)
dev->addr_len = ETH_ALEN;
ether_setup(dev);
+ dev->min_mtu = 0;
+ dev->max_mtu = ETH_MAX_MTU;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->netdev_ops = &bnep_netdev_ops;
diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c
index 46ac686c8911..bb308224099c 100644
--- a/net/bluetooth/cmtp/capi.c
+++ b/net/bluetooth/cmtp/capi.c
@@ -26,7 +26,7 @@
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fcntl.h>
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index e17aacbc5630..0b4dba08a14e 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4749,7 +4749,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
case LE_ADV_SCAN_RSP:
break;
default:
- BT_ERR_RATELIMITED("Unknown advetising packet type: 0x%02x",
+ BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x",
type);
return;
}
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 1015d9c8d97d..b5faff458d8b 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -21,6 +21,8 @@
SOFTWARE IS DISCLAIMED.
*/
+#include <linux/sched/signal.h>
+
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/mgmt.h>
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 48f9471e7c85..f64d6566021f 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -851,7 +851,7 @@ static int hci_sock_release(struct socket *sock)
if (hdev) {
if (hci_pi(sk)->channel == HCI_CHANNEL_USER) {
- /* When releasing an user channel exclusive access,
+ /* When releasing a user channel exclusive access,
* call hci_dev_do_close directly instead of calling
* hci_dev_close to ensure the exclusive access will
* be released and the controller brought back down.
@@ -1172,7 +1172,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
/* In case the transport is already up and
* running, clear the error here.
*
- * This can happen when opening an user
+ * This can happen when opening a user
* channel and HCI_AUTO_OFF grace period
* is still active.
*/
@@ -1190,7 +1190,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
if (!hci_sock_gen_cookie(sk)) {
/* In the case when a cookie has already been assigned,
* this socket will transition from a raw socket into
- * an user channel socket. For a clean transition, send
+ * a user channel socket. For a clean transition, send
* the close notification first.
*/
skb = create_monitor_ctrl_close(sk);
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 577f1c01454a..fc7f321a3823 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -481,14 +481,14 @@ static void l2cap_chan_destroy(struct kref *kref)
void l2cap_chan_hold(struct l2cap_chan *c)
{
- BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount));
+ BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));
kref_get(&c->kref);
}
void l2cap_chan_put(struct l2cap_chan *c)
{
- BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount));
+ BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));
kref_put(&c->kref, l2cap_chan_destroy);
}
@@ -2127,7 +2127,7 @@ static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan,
struct sk_buff **frag;
int sent = 0;
- if (copy_from_iter(skb_put(skb, count), count, &msg->msg_iter) != count)
+ if (!copy_from_iter_full(skb_put(skb, count), count, &msg->msg_iter))
return -EFAULT;
sent += count;
@@ -2147,8 +2147,8 @@ static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan,
*frag = tmp;
- if (copy_from_iter(skb_put(*frag, count), count,
- &msg->msg_iter) != count)
+ if (!copy_from_iter_full(skb_put(*frag, count), count,
+ &msg->msg_iter))
return -EFAULT;
sent += count;
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index a8ba752732c9..507b80d59dec 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -29,6 +29,7 @@
#include <linux/module.h>
#include <linux/export.h>
+#include <linux/sched/signal.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
@@ -300,7 +301,7 @@ done:
}
static int l2cap_sock_accept(struct socket *sock, struct socket *newsock,
- int flags)
+ int flags, bool kern)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct sock *sk = sock->sk, *nsk;
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 7511df72347f..ac3c650cb234 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -27,6 +27,7 @@
#include <linux/export.h>
#include <linux/debugfs.h>
+#include <linux/sched/signal.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
@@ -470,7 +471,8 @@ done:
return err;
}
-static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags)
+static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct sock *sk = sock->sk, *nsk;
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 3125ce670c2f..728e0c8dc8e7 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -27,6 +27,7 @@
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+#include <linux/sched/signal.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
@@ -626,7 +627,7 @@ done:
}
static int sco_sock_accept(struct socket *sock, struct socket *newsock,
- int flags)
+ int flags, bool kern)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct sock *sk = sock->sk, *ch;
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 43faf2aea2ab..fae391f1871f 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -57,7 +57,7 @@
#define SMP_TIMEOUT msecs_to_jiffies(30000)
#define AUTH_REQ_MASK(dev) (hci_dev_test_flag(dev, HCI_SC_ENABLED) ? \
- 0x1f : 0x07)
+ 0x3f : 0x07)
#define KEY_DIST_MASK 0x07
/* Maximum message length that can be passed to aes_cmac */
@@ -76,6 +76,7 @@ enum {
SMP_FLAG_DHKEY_PENDING,
SMP_FLAG_REMOTE_OOB,
SMP_FLAG_LOCAL_OOB,
+ SMP_FLAG_CT2,
};
struct smp_dev {
@@ -357,6 +358,22 @@ static int smp_h6(struct crypto_shash *tfm_cmac, const u8 w[16],
return err;
}
+static int smp_h7(struct crypto_shash *tfm_cmac, const u8 w[16],
+ const u8 salt[16], u8 res[16])
+{
+ int err;
+
+ SMP_DBG("w %16phN salt %16phN", w, salt);
+
+ err = aes_cmac(tfm_cmac, salt, w, 16, res);
+ if (err)
+ return err;
+
+ SMP_DBG("res %16phN", res);
+
+ return err;
+}
+
/* The following functions map to the legacy SMP crypto functions e, c1,
* s1 and ah.
*/
@@ -1130,20 +1147,31 @@ static void sc_add_ltk(struct smp_chan *smp)
static void sc_generate_link_key(struct smp_chan *smp)
{
- /* These constants are as specified in the core specification.
- * In ASCII they spell out to 'tmp1' and 'lebr'.
- */
- const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 };
+ /* From core spec. Spells out in ASCII as 'lebr'. */
const u8 lebr[4] = { 0x72, 0x62, 0x65, 0x6c };
smp->link_key = kzalloc(16, GFP_KERNEL);
if (!smp->link_key)
return;
- if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) {
- kzfree(smp->link_key);
- smp->link_key = NULL;
- return;
+ if (test_bit(SMP_FLAG_CT2, &smp->flags)) {
+ /* SALT = 0x00000000000000000000000000000000746D7031 */
+ const u8 salt[16] = { 0x31, 0x70, 0x6d, 0x74 };
+
+ if (smp_h7(smp->tfm_cmac, smp->tk, salt, smp->link_key)) {
+ kzfree(smp->link_key);
+ smp->link_key = NULL;
+ return;
+ }
+ } else {
+ /* From core spec. Spells out in ASCII as 'tmp1'. */
+ const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 };
+
+ if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) {
+ kzfree(smp->link_key);
+ smp->link_key = NULL;
+ return;
+ }
}
if (smp_h6(smp->tfm_cmac, smp->link_key, lebr, smp->link_key)) {
@@ -1169,10 +1197,7 @@ static void smp_allow_key_dist(struct smp_chan *smp)
static void sc_generate_ltk(struct smp_chan *smp)
{
- /* These constants are as specified in the core specification.
- * In ASCII they spell out to 'tmp2' and 'brle'.
- */
- const u8 tmp2[4] = { 0x32, 0x70, 0x6d, 0x74 };
+ /* From core spec. Spells out in ASCII as 'brle'. */
const u8 brle[4] = { 0x65, 0x6c, 0x72, 0x62 };
struct hci_conn *hcon = smp->conn->hcon;
struct hci_dev *hdev = hcon->hdev;
@@ -1187,8 +1212,19 @@ static void sc_generate_ltk(struct smp_chan *smp)
if (key->type == HCI_LK_DEBUG_COMBINATION)
set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags);
- if (smp_h6(smp->tfm_cmac, key->val, tmp2, smp->tk))
- return;
+ if (test_bit(SMP_FLAG_CT2, &smp->flags)) {
+ /* SALT = 0x00000000000000000000000000000000746D7032 */
+ const u8 salt[16] = { 0x32, 0x70, 0x6d, 0x74 };
+
+ if (smp_h7(smp->tfm_cmac, key->val, salt, smp->tk))
+ return;
+ } else {
+ /* From core spec. Spells out in ASCII as 'tmp2'. */
+ const u8 tmp2[4] = { 0x32, 0x70, 0x6d, 0x74 };
+
+ if (smp_h6(smp->tfm_cmac, key->val, tmp2, smp->tk))
+ return;
+ }
if (smp_h6(smp->tfm_cmac, smp->tk, brle, smp->tk))
return;
@@ -1669,6 +1705,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp,
if (!rsp) {
memset(req, 0, sizeof(*req));
+ req->auth_req = SMP_AUTH_CT2;
req->init_key_dist = local_dist;
req->resp_key_dist = remote_dist;
req->max_key_size = conn->hcon->enc_key_size;
@@ -1680,6 +1717,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp,
memset(rsp, 0, sizeof(*rsp));
+ rsp->auth_req = SMP_AUTH_CT2;
rsp->max_key_size = conn->hcon->enc_key_size;
rsp->init_key_dist = req->init_key_dist & remote_dist;
rsp->resp_key_dist = req->resp_key_dist & local_dist;
@@ -1744,6 +1782,9 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
build_bredr_pairing_cmd(smp, req, &rsp);
+ if (req->auth_req & SMP_AUTH_CT2)
+ set_bit(SMP_FLAG_CT2, &smp->flags);
+
key_size = min(req->max_key_size, rsp.max_key_size);
if (check_enc_key_size(conn, key_size))
return SMP_ENC_KEY_SIZE;
@@ -1761,9 +1802,13 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
build_pairing_cmd(conn, req, &rsp, auth);
- if (rsp.auth_req & SMP_AUTH_SC)
+ if (rsp.auth_req & SMP_AUTH_SC) {
set_bit(SMP_FLAG_SC, &smp->flags);
+ if (rsp.auth_req & SMP_AUTH_CT2)
+ set_bit(SMP_FLAG_CT2, &smp->flags);
+ }
+
if (conn->hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT)
sec_level = BT_SECURITY_MEDIUM;
else
@@ -1917,6 +1962,9 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb)
*/
smp->remote_key_dist &= rsp->resp_key_dist;
+ if ((req->auth_req & SMP_AUTH_CT2) && (auth & SMP_AUTH_CT2))
+ set_bit(SMP_FLAG_CT2, &smp->flags);
+
/* For BR/EDR this means we're done and can start phase 3 */
if (conn->hcon->type == ACL_LINK) {
/* Clear bits which are generated but not distributed */
@@ -2312,8 +2360,11 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
authreq = seclevel_to_authreq(sec_level);
- if (hci_dev_test_flag(hcon->hdev, HCI_SC_ENABLED))
+ if (hci_dev_test_flag(hcon->hdev, HCI_SC_ENABLED)) {
authreq |= SMP_AUTH_SC;
+ if (hci_dev_test_flag(hcon->hdev, HCI_SSP_ENABLED))
+ authreq |= SMP_AUTH_CT2;
+ }
/* Require MITM if IO Capability allows or the security level
* requires it.
diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h
index ffcc70b6b199..0ff6247eaa6c 100644
--- a/net/bluetooth/smp.h
+++ b/net/bluetooth/smp.h
@@ -57,6 +57,7 @@ struct smp_cmd_pairing {
#define SMP_AUTH_MITM 0x04
#define SMP_AUTH_SC 0x08
#define SMP_AUTH_KEYPRESS 0x10
+#define SMP_AUTH_CT2 0x20
#define SMP_CMD_PAIRING_CONFIRM 0x03
struct smp_cmd_pairing_confirm {
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index 0aefc011b668..40b1ede527ca 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_BRIDGE) += bridge.o
bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \
br_ioctl.o br_stp.o br_stp_bpdu.o \
- br_stp_if.o br_stp_timer.o br_netlink.o
+ br_stp_if.o br_stp_timer.o br_netlink.o \
+ br_netlink_tunnel.o
bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
@@ -18,7 +19,7 @@ obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o
bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o
-bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o
+bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o
bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 89a687f3c0a3..90f49a194249 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -19,7 +19,7 @@
#include <linux/list.h>
#include <linux/netfilter_bridge.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "br_private.h"
#define COMMON_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | \
@@ -79,7 +79,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
br_multicast_flood(mdst, skb, false, true);
else
br_flood(br, skb, BR_PKT_MULTICAST, false, true);
- } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) {
+ } else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) {
br_forward(dst->dst, skb, false, true);
} else {
br_flood(br, skb, BR_PKT_UNICAST, false, true);
@@ -119,6 +119,15 @@ static int br_dev_init(struct net_device *dev)
return err;
}
+static void br_dev_uninit(struct net_device *dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ br_multicast_uninit_stats(br);
+ br_vlan_flush(br);
+ free_percpu(br->stats);
+}
+
static int br_dev_open(struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
@@ -153,8 +162,8 @@ static int br_dev_stop(struct net_device *dev)
return 0;
}
-static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *stats)
+static void br_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *stats)
{
struct net_bridge *br = netdev_priv(dev);
struct pcpu_sw_netstats tmp, sum = { 0 };
@@ -178,14 +187,12 @@ static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev,
stats->tx_packets = sum.tx_packets;
stats->rx_bytes = sum.rx_bytes;
stats->rx_packets = sum.rx_packets;
-
- return stats;
}
static int br_change_mtu(struct net_device *dev, int new_mtu)
{
struct net_bridge *br = netdev_priv(dev);
- if (new_mtu < 68 || new_mtu > br_min_mtu(br))
+ if (new_mtu > br_min_mtu(br))
return -EINVAL;
dev->mtu = new_mtu;
@@ -334,6 +341,7 @@ static const struct net_device_ops br_netdev_ops = {
.ndo_open = br_dev_open,
.ndo_stop = br_dev_stop,
.ndo_init = br_dev_init,
+ .ndo_uninit = br_dev_uninit,
.ndo_start_xmit = br_dev_xmit,
.ndo_get_stats64 = br_get_stats64,
.ndo_set_mac_address = br_set_mac_address,
@@ -349,8 +357,6 @@ static const struct net_device_ops br_netdev_ops = {
.ndo_add_slave = br_add_slave,
.ndo_del_slave = br_del_slave,
.ndo_fix_features = br_fix_features,
- .ndo_neigh_construct = netdev_default_l2upper_neigh_construct,
- .ndo_neigh_destroy = netdev_default_l2upper_neigh_destroy,
.ndo_fdb_add = br_fdb_add,
.ndo_fdb_del = br_fdb_delete,
.ndo_fdb_dump = br_fdb_dump,
@@ -360,14 +366,6 @@ static const struct net_device_ops br_netdev_ops = {
.ndo_features_check = passthru_features_check,
};
-static void br_dev_free(struct net_device *dev)
-{
- struct net_bridge *br = netdev_priv(dev);
-
- free_percpu(br->stats);
- free_netdev(dev);
-}
-
static struct device_type br_type = {
.name = "bridge",
};
@@ -380,7 +378,7 @@ void br_dev_setup(struct net_device *dev)
ether_setup(dev);
dev->netdev_ops = &br_netdev_ops;
- dev->destructor = br_dev_free;
+ dev->destructor = free_netdev;
dev->ethtool_ops = &br_ethtool_ops;
SET_NETDEV_DEVTYPE(dev, &br_type);
dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE;
@@ -409,9 +407,11 @@ void br_dev_setup(struct net_device *dev)
br->bridge_max_age = br->max_age = 20 * HZ;
br->bridge_hello_time = br->hello_time = 2 * HZ;
br->bridge_forward_delay = br->forward_delay = 15 * HZ;
- br->ageing_time = BR_DEFAULT_AGEING_TIME;
+ br->bridge_ageing_time = br->ageing_time = BR_DEFAULT_AGEING_TIME;
+ dev->max_mtu = ETH_MAX_MTU;
br_netfilter_rtable_init(br);
br_stp_timer_init(br);
br_multicast_init(br);
+ INIT_DELAYED_WORK(&br->gc_work, br_fdb_cleanup);
}
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 6b43c8c88f19..6e08b7199dd7 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -28,9 +28,6 @@
#include "br_private.h"
static struct kmem_cache *br_fdb_cache __read_mostly;
-static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
- const unsigned char *addr,
- __u16 vid);
static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
const unsigned char *addr, u16 vid);
static void fdb_notify(struct net_bridge *br,
@@ -68,7 +65,7 @@ static inline unsigned long hold_time(const struct net_bridge *br)
static inline int has_expired(const struct net_bridge *br,
const struct net_bridge_fdb_entry *fdb)
{
- return !fdb->is_static &&
+ return !fdb->is_static && !fdb->added_by_external_learn &&
time_before_eq(fdb->updated + hold_time(br), jiffies);
}
@@ -86,6 +83,47 @@ static void fdb_rcu_free(struct rcu_head *head)
kmem_cache_free(br_fdb_cache, ent);
}
+static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
+ const unsigned char *addr,
+ __u16 vid)
+{
+ struct net_bridge_fdb_entry *f;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ hlist_for_each_entry_rcu(f, head, hlist)
+ if (ether_addr_equal(f->addr.addr, addr) && f->vlan_id == vid)
+ break;
+
+ return f;
+}
+
+/* requires bridge hash_lock */
+static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br,
+ const unsigned char *addr,
+ __u16 vid)
+{
+ struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
+ struct net_bridge_fdb_entry *fdb;
+
+ lockdep_assert_held_once(&br->hash_lock);
+
+ rcu_read_lock();
+ fdb = fdb_find_rcu(head, addr, vid);
+ rcu_read_unlock();
+
+ return fdb;
+}
+
+struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
+ const unsigned char *addr,
+ __u16 vid)
+{
+ struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
+
+ return fdb_find_rcu(head, addr, vid);
+}
+
/* When a static FDB entry is added, the mac address from the entry is
* added to the bridge private HW address list and all required ports
* are then updated with the new information.
@@ -154,7 +192,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
if (f->added_by_external_learn)
fdb_del_external_learn(f);
- hlist_del_rcu(&f->hlist);
+ hlist_del_init_rcu(&f->hlist);
fdb_notify(br, f, RTM_DELNEIGH);
call_rcu(&f->rcu, fdb_rcu_free);
}
@@ -198,11 +236,10 @@ void br_fdb_find_delete_local(struct net_bridge *br,
const struct net_bridge_port *p,
const unsigned char *addr, u16 vid)
{
- struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
struct net_bridge_fdb_entry *f;
spin_lock_bh(&br->hash_lock);
- f = fdb_find(head, addr, vid);
+ f = br_fdb_find(br, addr, vid);
if (f && f->is_local && !f->added_by_user && f->dst == p)
fdb_delete_local(br, p, f);
spin_unlock_bh(&br->hash_lock);
@@ -266,7 +303,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
spin_lock_bh(&br->hash_lock);
/* If old entry was unassociated with any port, then delete it. */
- f = __br_fdb_get(br, br->dev->dev_addr, 0);
+ f = br_fdb_find(br, br->dev->dev_addr, 0);
if (f && f->is_local && !f->dst && !f->added_by_user)
fdb_delete_local(br, NULL, f);
@@ -281,7 +318,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
list_for_each_entry(v, &vg->vlan_list, vlist) {
if (!br_vlan_should_use(v))
continue;
- f = __br_fdb_get(br, br->dev->dev_addr, v->vid);
+ f = br_fdb_find(br, br->dev->dev_addr, v->vid);
if (f && f->is_local && !f->dst && !f->added_by_user)
fdb_delete_local(br, NULL, f);
fdb_insert(br, NULL, newaddr, v->vid);
@@ -290,34 +327,43 @@ out:
spin_unlock_bh(&br->hash_lock);
}
-void br_fdb_cleanup(unsigned long _data)
+void br_fdb_cleanup(struct work_struct *work)
{
- struct net_bridge *br = (struct net_bridge *)_data;
+ struct net_bridge *br = container_of(work, struct net_bridge,
+ gc_work.work);
unsigned long delay = hold_time(br);
- unsigned long next_timer = jiffies + br->ageing_time;
+ unsigned long work_delay = delay;
+ unsigned long now = jiffies;
int i;
- spin_lock(&br->hash_lock);
for (i = 0; i < BR_HASH_SIZE; i++) {
struct net_bridge_fdb_entry *f;
struct hlist_node *n;
+ if (!br->hash[i].first)
+ continue;
+
+ spin_lock_bh(&br->hash_lock);
hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) {
unsigned long this_timer;
+
if (f->is_static)
continue;
if (f->added_by_external_learn)
continue;
this_timer = f->updated + delay;
- if (time_before_eq(this_timer, jiffies))
+ if (time_after(this_timer, now))
+ work_delay = min(work_delay, this_timer - now);
+ else
fdb_delete(br, f);
- else if (time_before(this_timer, next_timer))
- next_timer = this_timer;
}
+ spin_unlock_bh(&br->hash_lock);
+ cond_resched();
}
- spin_unlock(&br->hash_lock);
- mod_timer(&br->gc_timer, round_jiffies_up(next_timer));
+ /* Cleanup minimum 10 milliseconds apart */
+ work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10));
+ mod_delayed_work(system_long_wq, &br->gc_work, work_delay);
}
/* Completely flush all dynamic entries in forwarding database.*/
@@ -371,26 +417,6 @@ void br_fdb_delete_by_port(struct net_bridge *br,
spin_unlock_bh(&br->hash_lock);
}
-/* No locking or refcounting, assumes caller has rcu_read_lock */
-struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
- const unsigned char *addr,
- __u16 vid)
-{
- struct net_bridge_fdb_entry *fdb;
-
- hlist_for_each_entry_rcu(fdb,
- &br->hash[br_mac_hash(addr, vid)], hlist) {
- if (ether_addr_equal(fdb->addr.addr, addr) &&
- fdb->vlan_id == vid) {
- if (unlikely(has_expired(br, fdb)))
- break;
- return fdb;
- }
- }
-
- return NULL;
-}
-
#if IS_ENABLED(CONFIG_ATM_LANE)
/* Interface used by ATM LANE hook to test
* if an addr is on some other bridge port */
@@ -405,7 +431,7 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
if (!port)
ret = 0;
else {
- fdb = __br_fdb_get(port->br, addr, 0);
+ fdb = br_fdb_find_rcu(port->br, addr, 0);
ret = fdb && fdb->dst && fdb->dst->dev != dev &&
fdb->dst->state == BR_STATE_FORWARDING;
}
@@ -467,34 +493,6 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf,
return num;
}
-static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
- const unsigned char *addr,
- __u16 vid)
-{
- struct net_bridge_fdb_entry *fdb;
-
- hlist_for_each_entry(fdb, head, hlist) {
- if (ether_addr_equal(fdb->addr.addr, addr) &&
- fdb->vlan_id == vid)
- return fdb;
- }
- return NULL;
-}
-
-static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
- const unsigned char *addr,
- __u16 vid)
-{
- struct net_bridge_fdb_entry *fdb;
-
- hlist_for_each_entry_rcu(fdb, head, hlist) {
- if (ether_addr_equal(fdb->addr.addr, addr) &&
- fdb->vlan_id == vid)
- return fdb;
- }
- return NULL;
-}
-
static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
struct net_bridge_port *source,
const unsigned char *addr,
@@ -528,16 +526,15 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
if (!is_valid_ether_addr(addr))
return -EINVAL;
- fdb = fdb_find(head, addr, vid);
+ fdb = br_fdb_find(br, addr, vid);
if (fdb) {
/* it is okay to have multiple ports with same
* address, just use the first one.
*/
if (fdb->is_local)
return 0;
- br_warn(br, "adding interface %s with same address "
- "as a received packet\n",
- source ? source->dev->name : br->dev->name);
+ br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n",
+ source ? source->dev->name : br->dev->name, addr, vid);
fdb_delete(br, fdb);
}
@@ -583,16 +580,18 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
/* attempt to update an entry for a local interface */
if (unlikely(fdb->is_local)) {
if (net_ratelimit())
- br_warn(br, "received packet on %s with "
- "own address as source address\n",
- source->dev->name);
+ br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n",
+ source->dev->name, addr, vid);
} else {
+ unsigned long now = jiffies;
+
/* fastpath: update of existing entry */
if (unlikely(source != fdb->dst)) {
fdb->dst = source;
fdb_modified = true;
}
- fdb->updated = jiffies;
+ if (now != fdb->updated)
+ fdb->updated = now;
if (unlikely(added_by_user))
fdb->added_by_user = 1;
if (unlikely(fdb_modified))
@@ -600,7 +599,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
}
} else {
spin_lock(&br->hash_lock);
- if (likely(!fdb_find(head, addr, vid))) {
+ if (likely(!fdb_find_rcu(head, addr, vid))) {
fdb = fdb_create(head, source, addr, vid, 0, 0);
if (fdb) {
if (unlikely(added_by_user))
@@ -784,7 +783,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
return -EINVAL;
}
- fdb = fdb_find(head, addr, vid);
+ fdb = br_fdb_find(br, addr, vid);
if (fdb == NULL) {
if (!(flags & NLM_F_CREATE))
return -ENOENT;
@@ -931,55 +930,30 @@ out:
return err;
}
-static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr,
- u16 vid)
-{
- struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
- struct net_bridge_fdb_entry *fdb;
-
- fdb = fdb_find(head, addr, vid);
- if (!fdb)
- return -ENOENT;
-
- fdb_delete(br, fdb);
- return 0;
-}
-
-static int __br_fdb_delete_by_addr(struct net_bridge *br,
- const unsigned char *addr, u16 vid)
-{
- int err;
-
- spin_lock_bh(&br->hash_lock);
- err = fdb_delete_by_addr(br, addr, vid);
- spin_unlock_bh(&br->hash_lock);
-
- return err;
-}
-
-static int fdb_delete_by_addr_and_port(struct net_bridge_port *p,
+static int fdb_delete_by_addr_and_port(struct net_bridge *br,
+ const struct net_bridge_port *p,
const u8 *addr, u16 vlan)
{
- struct net_bridge *br = p->br;
- struct hlist_head *head = &br->hash[br_mac_hash(addr, vlan)];
struct net_bridge_fdb_entry *fdb;
- fdb = fdb_find(head, addr, vlan);
+ fdb = br_fdb_find(br, addr, vlan);
if (!fdb || fdb->dst != p)
return -ENOENT;
fdb_delete(br, fdb);
+
return 0;
}
-static int __br_fdb_delete(struct net_bridge_port *p,
+static int __br_fdb_delete(struct net_bridge *br,
+ const struct net_bridge_port *p,
const unsigned char *addr, u16 vid)
{
int err;
- spin_lock_bh(&p->br->hash_lock);
- err = fdb_delete_by_addr_and_port(p, addr, vid);
- spin_unlock_bh(&p->br->hash_lock);
+ spin_lock_bh(&br->hash_lock);
+ err = fdb_delete_by_addr_and_port(br, p, addr, vid);
+ spin_unlock_bh(&br->hash_lock);
return err;
}
@@ -992,7 +966,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
struct net_bridge_vlan_group *vg;
struct net_bridge_port *p = NULL;
struct net_bridge_vlan *v;
- struct net_bridge *br = NULL;
+ struct net_bridge *br;
int err;
if (dev->priv_flags & IFF_EBRIDGE) {
@@ -1006,6 +980,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
return -EINVAL;
}
vg = nbp_vlan_group(p);
+ br = p->br;
}
if (vid) {
@@ -1015,30 +990,20 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
return -EINVAL;
}
- if (dev->priv_flags & IFF_EBRIDGE)
- err = __br_fdb_delete_by_addr(br, addr, vid);
- else
- err = __br_fdb_delete(p, addr, vid);
+ err = __br_fdb_delete(br, p, addr, vid);
} else {
err = -ENOENT;
- if (dev->priv_flags & IFF_EBRIDGE)
- err = __br_fdb_delete_by_addr(br, addr, 0);
- else
- err &= __br_fdb_delete(p, addr, 0);
-
+ err &= __br_fdb_delete(br, p, addr, 0);
if (!vg || !vg->num_vlans)
- goto out;
+ return err;
list_for_each_entry(v, &vg->vlan_list, vlist) {
if (!br_vlan_should_use(v))
continue;
- if (dev->priv_flags & IFF_EBRIDGE)
- err = __br_fdb_delete_by_addr(br, addr, v->vid);
- else
- err &= __br_fdb_delete(p, addr, v->vid);
+ err &= __br_fdb_delete(br, p, addr, v->vid);
}
}
-out:
+
return err;
}
@@ -1109,7 +1074,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
spin_lock_bh(&br->hash_lock);
head = &br->hash[br_mac_hash(addr, vid)];
- fdb = fdb_find(head, addr, vid);
+ fdb = br_fdb_find(br, addr, vid);
if (!fdb) {
fdb = fdb_create(head, p, addr, vid, 0, 0);
if (!fdb) {
@@ -1137,15 +1102,13 @@ err_unlock:
int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
const unsigned char *addr, u16 vid)
{
- struct hlist_head *head;
struct net_bridge_fdb_entry *fdb;
int err = 0;
ASSERT_RTNL();
spin_lock_bh(&br->hash_lock);
- head = &br->hash[br_mac_hash(addr, vid)];
- fdb = fdb_find(head, addr, vid);
+ fdb = br_fdb_find(br, addr, vid);
if (fdb && fdb->added_by_external_learn)
fdb_delete(br, fdb);
else
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 7cb41aee4c82..902af6ba481c 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -80,7 +80,7 @@ static void __br_forward(const struct net_bridge_port *to,
int br_hook;
vg = nbp_vlan_group_rcu(to);
- skb = br_handle_vlan(to->br, vg, skb);
+ skb = br_handle_vlan(to->br, to, vg, skb);
if (!skb)
return;
@@ -186,8 +186,9 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
/* Do not flood unicast traffic to ports that turn it off */
if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD))
continue;
+ /* Do not flood if mc off, except for traffic we originate */
if (pkt_type == BR_PKT_MULTICAST &&
- !(p->flags & BR_MCAST_FLOOD))
+ !(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
continue;
/* Do not flood to ports that enable proxy ARP */
@@ -220,6 +221,31 @@ out:
}
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb,
+ const unsigned char *addr, bool local_orig)
+{
+ struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
+ const unsigned char *src = eth_hdr(skb)->h_source;
+
+ if (!should_deliver(p, skb))
+ return;
+
+ /* Even with hairpin, no soliloquies - prevent breaking IPv6 DAD */
+ if (skb->dev == p->dev && ether_addr_equal(src, addr))
+ return;
+
+ skb = skb_copy(skb, GFP_ATOMIC);
+ if (!skb) {
+ dev->stats.tx_dropped++;
+ return;
+ }
+
+ if (!is_broadcast_ether_addr(addr))
+ memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN);
+
+ __br_forward(p, skb, local_orig);
+}
+
/* called with rcu_read_lock */
void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
struct sk_buff *skb,
@@ -241,10 +267,20 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) :
NULL;
- port = (unsigned long)lport > (unsigned long)rport ?
- lport : rport;
+ if ((unsigned long)lport > (unsigned long)rport) {
+ port = lport;
+
+ if (port->flags & BR_MULTICAST_TO_UNICAST) {
+ maybe_deliver_addr(lport, skb, p->eth_addr,
+ local_orig);
+ goto delivered;
+ }
+ } else {
+ port = rport;
+ }
prev = maybe_deliver(prev, port, skb, local_orig);
+delivered:
if (IS_ERR(prev))
goto out;
if (prev == port)
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index ed0dd3340084..56a2a72e7738 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -311,9 +311,8 @@ void br_dev_delete(struct net_device *dev, struct list_head *head)
br_fdb_delete_by_port(br, NULL, 0, 1);
- br_vlan_flush(br);
br_multicast_dev_del(br);
- del_timer_sync(&br->gc_timer);
+ cancel_delayed_work_sync(&br->gc_work);
br_sysfs_delbr(br->dev);
unregister_netdevice_queue(br->dev, head);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 855b72fbe1da..013f2290bfa5 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -21,6 +21,7 @@
#include <linux/export.h>
#include <linux/rculist.h>
#include "br_private.h"
+#include "br_private_tunnel.h"
/* Hook for brouter */
br_should_route_hook_t __rcu *br_should_route_hook __read_mostly;
@@ -29,6 +30,7 @@ EXPORT_SYMBOL(br_should_route_hook);
static int
br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ br_drop_fake_rtable(skb);
return netif_receive_skb(skb);
}
@@ -57,7 +59,7 @@ static int br_pass_frame_up(struct sk_buff *skb)
indev = skb->dev;
skb->dev = brdev;
- skb = br_handle_vlan(br, vg, skb);
+ skb = br_handle_vlan(br, NULL, vg, skb);
if (!skb)
return NET_RX_DROP;
/* update the multicast stats if the packet is IGMP/MLD */
@@ -113,7 +115,7 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
return;
}
- f = __br_fdb_get(br, n->ha, vid);
+ f = br_fdb_find_rcu(br, n->ha, vid);
if (f && ((p->flags & BR_PROXYARP) ||
(f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) {
arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip,
@@ -188,16 +190,19 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
}
break;
case BR_PKT_UNICAST:
- dst = __br_fdb_get(br, dest, vid);
+ dst = br_fdb_find_rcu(br, dest, vid);
default:
break;
}
if (dst) {
+ unsigned long now = jiffies;
+
if (dst->is_local)
return br_pass_frame_up(skb);
- dst->used = jiffies;
+ if (now != dst->used)
+ dst->used = now;
br_forward(dst->dst, skb, local_rcv, false);
} else {
if (!mcast_hit)
@@ -261,6 +266,11 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
return RX_HANDLER_CONSUMED;
p = br_port_get_rcu(skb->dev);
+ if (p->flags & BR_VLAN_TUNNEL) {
+ if (br_handle_ingress_vlan_tunnel(skb, p,
+ nbp_vlan_group_rcu(p)))
+ goto drop;
+ }
if (unlikely(is_link_local_ether_addr(dest))) {
u16 fwd_mask = p->br->group_fwd_mask_required;
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index d99b2009771a..7970f8540cbb 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -18,7 +18,7 @@
#include <linux/slab.h>
#include <linux/times.h>
#include <net/net_namespace.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "br_private.h"
static int get_bridge_ifindices(struct net *net, int *indices, int num)
@@ -149,7 +149,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
b.hello_timer_value = br_timer_value(&br->hello_timer);
b.tcn_timer_value = br_timer_value(&br->tcn_timer);
b.topology_change_timer_value = br_timer_value(&br->topology_change_timer);
- b.gc_timer_value = br_timer_value(&br->gc_timer);
+ b.gc_timer_value = br_timer_value(&br->gc_work.timer);
rcu_read_unlock();
if (copy_to_user((void __user *)args[1], &b, sizeof(b)))
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 7dbc80d01eb0..056e6ac49d8f 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -531,7 +531,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
break;
}
- p = br_multicast_new_port_group(port, group, *pp, state);
+ p = br_multicast_new_port_group(port, group, *pp, state, NULL);
if (unlikely(!p))
return -ENOMEM;
rcu_assign_pointer(*pp, p);
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 2136e45f5277..faa7261a992f 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -25,7 +25,9 @@
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/inetdevice.h>
+#include <linux/mroute.h>
#include <net/ip.h>
+#include <net/switchdev.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/mld.h>
@@ -42,12 +44,15 @@ static void br_multicast_add_router(struct net_bridge *br,
static void br_ip4_multicast_leave_group(struct net_bridge *br,
struct net_bridge_port *port,
__be32 group,
- __u16 vid);
+ __u16 vid,
+ const unsigned char *src);
+
+static void __del_port_router(struct net_bridge_port *p);
#if IS_ENABLED(CONFIG_IPV6)
static void br_ip6_multicast_leave_group(struct net_bridge *br,
struct net_bridge_port *port,
const struct in6_addr *group,
- __u16 vid);
+ __u16 vid, const unsigned char *src);
#endif
unsigned int br_mdb_rehash_seq;
@@ -364,13 +369,18 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
__be32 group,
u8 *igmp_type)
{
+ struct igmpv3_query *ihv3;
+ size_t igmp_hdr_size;
struct sk_buff *skb;
struct igmphdr *ih;
struct ethhdr *eth;
struct iphdr *iph;
+ igmp_hdr_size = sizeof(*ih);
+ if (br->multicast_igmp_version == 3)
+ igmp_hdr_size = sizeof(*ihv3);
skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*iph) +
- sizeof(*ih) + 4);
+ igmp_hdr_size + 4);
if (!skb)
goto out;
@@ -395,7 +405,7 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
iph->version = 4;
iph->ihl = 6;
iph->tos = 0xc0;
- iph->tot_len = htons(sizeof(*iph) + sizeof(*ih) + 4);
+ iph->tot_len = htons(sizeof(*iph) + igmp_hdr_size + 4);
iph->id = 0;
iph->frag_off = htons(IP_DF);
iph->ttl = 1;
@@ -411,17 +421,37 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
skb_put(skb, 24);
skb_set_transport_header(skb, skb->len);
- ih = igmp_hdr(skb);
*igmp_type = IGMP_HOST_MEMBERSHIP_QUERY;
- ih->type = IGMP_HOST_MEMBERSHIP_QUERY;
- ih->code = (group ? br->multicast_last_member_interval :
- br->multicast_query_response_interval) /
- (HZ / IGMP_TIMER_SCALE);
- ih->group = group;
- ih->csum = 0;
- ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
- skb_put(skb, sizeof(*ih));
+ switch (br->multicast_igmp_version) {
+ case 2:
+ ih = igmp_hdr(skb);
+ ih->type = IGMP_HOST_MEMBERSHIP_QUERY;
+ ih->code = (group ? br->multicast_last_member_interval :
+ br->multicast_query_response_interval) /
+ (HZ / IGMP_TIMER_SCALE);
+ ih->group = group;
+ ih->csum = 0;
+ ih->csum = ip_compute_csum((void *)ih, sizeof(*ih));
+ break;
+ case 3:
+ ihv3 = igmpv3_query_hdr(skb);
+ ihv3->type = IGMP_HOST_MEMBERSHIP_QUERY;
+ ihv3->code = (group ? br->multicast_last_member_interval :
+ br->multicast_query_response_interval) /
+ (HZ / IGMP_TIMER_SCALE);
+ ihv3->group = group;
+ ihv3->qqic = br->multicast_query_interval / HZ;
+ ihv3->nsrcs = 0;
+ ihv3->resv = 0;
+ ihv3->suppress = 0;
+ ihv3->qrv = 2;
+ ihv3->csum = 0;
+ ihv3->csum = ip_compute_csum((void *)ihv3, sizeof(*ihv3));
+ break;
+ }
+
+ skb_put(skb, igmp_hdr_size);
__skb_pull(skb, sizeof(*eth));
out:
@@ -433,15 +463,20 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
const struct in6_addr *grp,
u8 *igmp_type)
{
- struct sk_buff *skb;
+ struct mld2_query *mld2q;
+ unsigned long interval;
struct ipv6hdr *ip6h;
struct mld_msg *mldq;
+ size_t mld_hdr_size;
+ struct sk_buff *skb;
struct ethhdr *eth;
u8 *hopopt;
- unsigned long interval;
+ mld_hdr_size = sizeof(*mldq);
+ if (br->multicast_mld_version == 2)
+ mld_hdr_size = sizeof(*mld2q);
skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*ip6h) +
- 8 + sizeof(*mldq));
+ 8 + mld_hdr_size);
if (!skb)
goto out;
@@ -460,7 +495,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
ip6h = ipv6_hdr(skb);
*(__force __be32 *)ip6h = htonl(0x60000000);
- ip6h->payload_len = htons(8 + sizeof(*mldq));
+ ip6h->payload_len = htons(8 + mld_hdr_size);
ip6h->nexthdr = IPPROTO_HOPOPTS;
ip6h->hop_limit = 1;
ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1));
@@ -488,26 +523,47 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
/* ICMPv6 */
skb_set_transport_header(skb, skb->len);
- mldq = (struct mld_msg *) icmp6_hdr(skb);
-
interval = ipv6_addr_any(grp) ?
br->multicast_query_response_interval :
br->multicast_last_member_interval;
-
*igmp_type = ICMPV6_MGM_QUERY;
- mldq->mld_type = ICMPV6_MGM_QUERY;
- mldq->mld_code = 0;
- mldq->mld_cksum = 0;
- mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval));
- mldq->mld_reserved = 0;
- mldq->mld_mca = *grp;
-
- /* checksum */
- mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
- sizeof(*mldq), IPPROTO_ICMPV6,
- csum_partial(mldq,
- sizeof(*mldq), 0));
- skb_put(skb, sizeof(*mldq));
+ switch (br->multicast_mld_version) {
+ case 1:
+ mldq = (struct mld_msg *)icmp6_hdr(skb);
+ mldq->mld_type = ICMPV6_MGM_QUERY;
+ mldq->mld_code = 0;
+ mldq->mld_cksum = 0;
+ mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval));
+ mldq->mld_reserved = 0;
+ mldq->mld_mca = *grp;
+ mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ sizeof(*mldq), IPPROTO_ICMPV6,
+ csum_partial(mldq,
+ sizeof(*mldq),
+ 0));
+ break;
+ case 2:
+ mld2q = (struct mld2_query *)icmp6_hdr(skb);
+ mld2q->mld2q_mrc = htons((u16)jiffies_to_msecs(interval));
+ mld2q->mld2q_type = ICMPV6_MGM_QUERY;
+ mld2q->mld2q_code = 0;
+ mld2q->mld2q_cksum = 0;
+ mld2q->mld2q_resv1 = 0;
+ mld2q->mld2q_resv2 = 0;
+ mld2q->mld2q_suppress = 0;
+ mld2q->mld2q_qrv = 2;
+ mld2q->mld2q_nsrcs = 0;
+ mld2q->mld2q_qqic = br->multicast_query_interval / HZ;
+ mld2q->mld2q_mca = *grp;
+ mld2q->mld2q_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ sizeof(*mld2q),
+ IPPROTO_ICMPV6,
+ csum_partial(mld2q,
+ sizeof(*mld2q),
+ 0));
+ break;
+ }
+ skb_put(skb, mld_hdr_size);
__skb_pull(skb, sizeof(*eth));
@@ -607,7 +663,8 @@ err:
}
struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
- struct net_bridge_port *port, struct br_ip *group)
+ struct net_bridge_port *p,
+ struct br_ip *group)
{
struct net_bridge_mdb_htable *mdb;
struct net_bridge_mdb_entry *mp;
@@ -623,7 +680,7 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
}
hash = br_ip_hash(mdb, group);
- mp = br_multicast_get_group(br, port, group, hash);
+ mp = br_multicast_get_group(br, p, group, hash);
switch (PTR_ERR(mp)) {
case 0:
break;
@@ -658,7 +715,8 @@ struct net_bridge_port_group *br_multicast_new_port_group(
struct net_bridge_port *port,
struct br_ip *group,
struct net_bridge_port_group __rcu *next,
- unsigned char flags)
+ unsigned char flags,
+ const unsigned char *src)
{
struct net_bridge_port_group *p;
@@ -673,16 +731,36 @@ struct net_bridge_port_group *br_multicast_new_port_group(
hlist_add_head(&p->mglist, &port->mglist);
setup_timer(&p->timer, br_multicast_port_group_expired,
(unsigned long)p);
+
+ if (src)
+ memcpy(p->eth_addr, src, ETH_ALEN);
+ else
+ memset(p->eth_addr, 0xff, ETH_ALEN);
+
return p;
}
+static bool br_port_group_equal(struct net_bridge_port_group *p,
+ struct net_bridge_port *port,
+ const unsigned char *src)
+{
+ if (p->port != port)
+ return false;
+
+ if (!(port->flags & BR_MULTICAST_TO_UNICAST))
+ return true;
+
+ return ether_addr_equal(src, p->eth_addr);
+}
+
static int br_multicast_add_group(struct net_bridge *br,
struct net_bridge_port *port,
- struct br_ip *group)
+ struct br_ip *group,
+ const unsigned char *src)
{
- struct net_bridge_mdb_entry *mp;
- struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_port_group *p;
+ struct net_bridge_mdb_entry *mp;
unsigned long now = jiffies;
int err;
@@ -705,13 +783,13 @@ static int br_multicast_add_group(struct net_bridge *br,
for (pp = &mp->ports;
(p = mlock_dereference(*pp, br)) != NULL;
pp = &p->next) {
- if (p->port == port)
+ if (br_port_group_equal(p, port, src))
goto found;
if ((unsigned long)p->port < (unsigned long)port)
break;
}
- p = br_multicast_new_port_group(port, group, *pp, 0);
+ p = br_multicast_new_port_group(port, group, *pp, 0, src);
if (unlikely(!p))
goto err;
rcu_assign_pointer(*pp, p);
@@ -730,7 +808,8 @@ err:
static int br_ip4_multicast_add_group(struct net_bridge *br,
struct net_bridge_port *port,
__be32 group,
- __u16 vid)
+ __u16 vid,
+ const unsigned char *src)
{
struct br_ip br_group;
@@ -741,14 +820,15 @@ static int br_ip4_multicast_add_group(struct net_bridge *br,
br_group.proto = htons(ETH_P_IP);
br_group.vid = vid;
- return br_multicast_add_group(br, port, &br_group);
+ return br_multicast_add_group(br, port, &br_group, src);
}
#if IS_ENABLED(CONFIG_IPV6)
static int br_ip6_multicast_add_group(struct net_bridge *br,
struct net_bridge_port *port,
const struct in6_addr *group,
- __u16 vid)
+ __u16 vid,
+ const unsigned char *src)
{
struct br_ip br_group;
@@ -759,7 +839,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br,
br_group.proto = htons(ETH_P_IPV6);
br_group.vid = vid;
- return br_multicast_add_group(br, port, &br_group);
+ return br_multicast_add_group(br, port, &br_group, src);
}
#endif
@@ -771,16 +851,10 @@ static void br_multicast_router_expired(unsigned long data)
spin_lock(&br->multicast_lock);
if (port->multicast_router == MDB_RTR_TYPE_DISABLED ||
port->multicast_router == MDB_RTR_TYPE_PERM ||
- timer_pending(&port->multicast_router_timer) ||
- hlist_unhashed(&port->rlist))
+ timer_pending(&port->multicast_router_timer))
goto out;
- hlist_del_init_rcu(&port->rlist);
- br_rtr_notify(br->dev, port, RTM_DELMDB);
- /* Don't allow timer refresh if the router expired */
- if (port->multicast_router == MDB_RTR_TYPE_TEMP)
- port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
-
+ __del_port_router(port);
out:
spin_unlock(&br->multicast_lock);
}
@@ -860,9 +934,9 @@ static void br_multicast_send_query(struct net_bridge *br,
struct net_bridge_port *port,
struct bridge_mcast_own_query *own_query)
{
- unsigned long time;
- struct br_ip br_group;
struct bridge_mcast_other_query *other_query = NULL;
+ struct br_ip br_group;
+ unsigned long time;
if (!netif_running(br->dev) || br->multicast_disabled ||
!br->multicast_querier)
@@ -929,6 +1003,18 @@ static void br_ip6_multicast_port_query_expired(unsigned long data)
}
#endif
+static void br_mc_disabled_update(struct net_device *dev, bool value)
+{
+ struct switchdev_attr attr = {
+ .orig_dev = dev,
+ .id = SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
+ .flags = SWITCHDEV_F_DEFER,
+ .u.mc_disabled = value,
+ };
+
+ switchdev_port_attr_set(dev, &attr);
+}
+
int br_multicast_add_port(struct net_bridge_port *port)
{
port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
@@ -941,6 +1027,8 @@ int br_multicast_add_port(struct net_bridge_port *port)
setup_timer(&port->ip6_own_query.timer,
br_ip6_multicast_port_query_expired, (unsigned long)port);
#endif
+ br_mc_disabled_update(port->dev, port->br->multicast_disabled);
+
port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats);
if (!port->mcast_stats)
return -ENOMEM;
@@ -1008,13 +1096,8 @@ void br_multicast_disable_port(struct net_bridge_port *port)
if (!(pg->flags & MDB_PG_FLAGS_PERMANENT))
br_multicast_del_pg(br, pg);
- if (!hlist_unhashed(&port->rlist)) {
- hlist_del_init_rcu(&port->rlist);
- br_rtr_notify(br->dev, port, RTM_DELMDB);
- /* Don't allow timer refresh if disabling */
- if (port->multicast_router == MDB_RTR_TYPE_TEMP)
- port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
- }
+ __del_port_router(port);
+
del_timer(&port->multicast_router_timer);
del_timer(&port->ip4_own_query.timer);
#if IS_ENABLED(CONFIG_IPV6)
@@ -1028,6 +1111,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
struct sk_buff *skb,
u16 vid)
{
+ const unsigned char *src;
struct igmpv3_report *ih;
struct igmpv3_grec *grec;
int i;
@@ -1068,12 +1152,14 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
continue;
}
+ src = eth_hdr(skb)->h_source;
if ((type == IGMPV3_CHANGE_TO_INCLUDE ||
type == IGMPV3_MODE_IS_INCLUDE) &&
ntohs(grec->grec_nsrcs) == 0) {
- br_ip4_multicast_leave_group(br, port, group, vid);
+ br_ip4_multicast_leave_group(br, port, group, vid, src);
} else {
- err = br_ip4_multicast_add_group(br, port, group, vid);
+ err = br_ip4_multicast_add_group(br, port, group, vid,
+ src);
if (err)
break;
}
@@ -1088,6 +1174,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
struct sk_buff *skb,
u16 vid)
{
+ const unsigned char *src;
struct icmp6hdr *icmp6h;
struct mld2_grec *grec;
int i;
@@ -1135,14 +1222,16 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
continue;
}
+ src = eth_hdr(skb)->h_source;
if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE ||
grec->grec_type == MLD2_MODE_IS_INCLUDE) &&
ntohs(*nsrcs) == 0) {
br_ip6_multicast_leave_group(br, port, &grec->grec_mca,
- vid);
+ vid, src);
} else {
err = br_ip6_multicast_add_group(br, port,
- &grec->grec_mca, vid);
+ &grec->grec_mca, vid,
+ src);
if (err)
break;
}
@@ -1228,6 +1317,19 @@ br_multicast_update_query_timer(struct net_bridge *br,
mod_timer(&query->timer, jiffies + br->multicast_querier_interval);
}
+static void br_port_mc_router_state_change(struct net_bridge_port *p,
+ bool is_mc_router)
+{
+ struct switchdev_attr attr = {
+ .orig_dev = p->dev,
+ .id = SWITCHDEV_ATTR_ID_PORT_MROUTER,
+ .flags = SWITCHDEV_F_DEFER,
+ .u.mrouter = is_mc_router,
+ };
+
+ switchdev_port_attr_set(p->dev, &attr);
+}
+
/*
* Add port to router_list
* list is maintained ordered by pointer value
@@ -1253,6 +1355,7 @@ static void br_multicast_add_router(struct net_bridge *br,
else
hlist_add_head_rcu(&port->rlist, &br->router_list);
br_rtr_notify(br->dev, port, RTM_NEWMDB);
+ br_port_mc_router_state_change(port, true);
}
static void br_multicast_mark_router(struct net_bridge *br,
@@ -1458,7 +1561,8 @@ br_multicast_leave_group(struct net_bridge *br,
struct net_bridge_port *port,
struct br_ip *group,
struct bridge_mcast_other_query *other_query,
- struct bridge_mcast_own_query *own_query)
+ struct bridge_mcast_own_query *own_query,
+ const unsigned char *src)
{
struct net_bridge_mdb_htable *mdb;
struct net_bridge_mdb_entry *mp;
@@ -1482,7 +1586,7 @@ br_multicast_leave_group(struct net_bridge *br,
for (pp = &mp->ports;
(p = mlock_dereference(*pp, br)) != NULL;
pp = &p->next) {
- if (p->port != port)
+ if (!br_port_group_equal(p, port, src))
continue;
rcu_assign_pointer(*pp, p->next);
@@ -1513,7 +1617,7 @@ br_multicast_leave_group(struct net_bridge *br,
for (p = mlock_dereference(mp->ports, br);
p != NULL;
p = mlock_dereference(p->next, br)) {
- if (p->port != port)
+ if (!br_port_group_equal(p, port, src))
continue;
if (!hlist_unhashed(&p->mglist) &&
@@ -1564,7 +1668,8 @@ out:
static void br_ip4_multicast_leave_group(struct net_bridge *br,
struct net_bridge_port *port,
__be32 group,
- __u16 vid)
+ __u16 vid,
+ const unsigned char *src)
{
struct br_ip br_group;
struct bridge_mcast_own_query *own_query;
@@ -1579,14 +1684,15 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br,
br_group.vid = vid;
br_multicast_leave_group(br, port, &br_group, &br->ip4_other_query,
- own_query);
+ own_query, src);
}
#if IS_ENABLED(CONFIG_IPV6)
static void br_ip6_multicast_leave_group(struct net_bridge *br,
struct net_bridge_port *port,
const struct in6_addr *group,
- __u16 vid)
+ __u16 vid,
+ const unsigned char *src)
{
struct br_ip br_group;
struct bridge_mcast_own_query *own_query;
@@ -1601,7 +1707,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
br_group.vid = vid;
br_multicast_leave_group(br, port, &br_group, &br->ip6_other_query,
- own_query);
+ own_query, src);
}
#endif
@@ -1638,20 +1744,40 @@ static void br_multicast_err_count(const struct net_bridge *br,
u64_stats_update_end(&pstats->syncp);
}
+static void br_multicast_pim(struct net_bridge *br,
+ struct net_bridge_port *port,
+ const struct sk_buff *skb)
+{
+ unsigned int offset = skb_transport_offset(skb);
+ struct pimhdr *pimhdr, _pimhdr;
+
+ pimhdr = skb_header_pointer(skb, offset, sizeof(_pimhdr), &_pimhdr);
+ if (!pimhdr || pim_hdr_version(pimhdr) != PIM_VERSION ||
+ pim_hdr_type(pimhdr) != PIM_TYPE_HELLO)
+ return;
+
+ br_multicast_mark_router(br, port);
+}
+
static int br_multicast_ipv4_rcv(struct net_bridge *br,
struct net_bridge_port *port,
struct sk_buff *skb,
u16 vid)
{
struct sk_buff *skb_trimmed = NULL;
+ const unsigned char *src;
struct igmphdr *ih;
int err;
err = ip_mc_check_igmp(skb, &skb_trimmed);
if (err == -ENOMSG) {
- if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr))
+ if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr)) {
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
+ } else if (pim_ipv4_all_pim_routers(ip_hdr(skb)->daddr)) {
+ if (ip_hdr(skb)->protocol == IPPROTO_PIM)
+ br_multicast_pim(br, port, skb);
+ }
return 0;
} else if (err < 0) {
br_multicast_err_count(br, port, skb->protocol);
@@ -1659,13 +1785,14 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
}
ih = igmp_hdr(skb);
+ src = eth_hdr(skb)->h_source;
BR_INPUT_SKB_CB(skb)->igmp = ih->type;
switch (ih->type) {
case IGMP_HOST_MEMBERSHIP_REPORT:
case IGMPV2_HOST_MEMBERSHIP_REPORT:
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
- err = br_ip4_multicast_add_group(br, port, ih->group, vid);
+ err = br_ip4_multicast_add_group(br, port, ih->group, vid, src);
break;
case IGMPV3_HOST_MEMBERSHIP_REPORT:
err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid);
@@ -1674,7 +1801,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
err = br_ip4_multicast_query(br, port, skb_trimmed, vid);
break;
case IGMP_HOST_LEAVE_MESSAGE:
- br_ip4_multicast_leave_group(br, port, ih->group, vid);
+ br_ip4_multicast_leave_group(br, port, ih->group, vid, src);
break;
}
@@ -1694,6 +1821,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
u16 vid)
{
struct sk_buff *skb_trimmed = NULL;
+ const unsigned char *src;
struct mld_msg *mld;
int err;
@@ -1713,8 +1841,10 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
switch (mld->mld_type) {
case ICMPV6_MGM_REPORT:
+ src = eth_hdr(skb)->h_source;
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
- err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid);
+ err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid,
+ src);
break;
case ICMPV6_MLD2_REPORT:
err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid);
@@ -1723,7 +1853,8 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
err = br_ip6_multicast_query(br, port, skb_trimmed, vid);
break;
case ICMPV6_MGM_REDUCTION:
- br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid);
+ src = eth_hdr(skb)->h_source;
+ br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid, src);
break;
}
@@ -1811,7 +1942,9 @@ void br_multicast_init(struct net_bridge *br)
br->ip4_other_query.delay_time = 0;
br->ip4_querier.port = NULL;
+ br->multicast_igmp_version = 2;
#if IS_ENABLED(CONFIG_IPV6)
+ br->multicast_mld_version = 1;
br->ip6_other_query.delay_time = 0;
br->ip6_querier.port = NULL;
#endif
@@ -1898,8 +2031,6 @@ void br_multicast_dev_del(struct net_bridge *br)
out:
spin_unlock_bh(&br->multicast_lock);
-
- free_percpu(br->mcast_stats);
}
int br_multicast_set_router(struct net_bridge *br, unsigned long val)
@@ -1930,6 +2061,11 @@ static void __del_port_router(struct net_bridge_port *p)
return;
hlist_del_init_rcu(&p->rlist);
br_rtr_notify(p->br->dev, p, RTM_DELMDB);
+ br_port_mc_router_state_change(p, false);
+
+ /* don't allow timer refresh */
+ if (p->multicast_router == MDB_RTR_TYPE_TEMP)
+ p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
}
int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
@@ -2007,6 +2143,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val)
if (br->multicast_disabled == !val)
goto unlock;
+ br_mc_disabled_update(br->dev, !val);
br->multicast_disabled = !val;
if (br->multicast_disabled)
goto unlock;
@@ -2112,6 +2249,44 @@ unlock:
return err;
}
+int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val)
+{
+ /* Currently we support only version 2 and 3 */
+ switch (val) {
+ case 2:
+ case 3:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ spin_lock_bh(&br->multicast_lock);
+ br->multicast_igmp_version = val;
+ spin_unlock_bh(&br->multicast_lock);
+
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val)
+{
+ /* Currently we support version 1 and 2 */
+ switch (val) {
+ case 1:
+ case 2:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ spin_lock_bh(&br->multicast_lock);
+ br->multicast_mld_version = val;
+ spin_unlock_bh(&br->multicast_lock);
+
+ return 0;
+}
+#endif
+
/**
* br_multicast_list_adjacent - Returns snooped multicast addresses
* @dev: The bridge port adjacent to which to retrieve addresses
@@ -2354,6 +2529,11 @@ int br_multicast_init_stats(struct net_bridge *br)
return 0;
}
+void br_multicast_uninit_stats(struct net_bridge *br)
+{
+ free_percpu(br->mcast_stats);
+}
+
static void mcast_stats_add_dir(u64 *dst, u64 *src)
{
dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX];
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 2fe9345c1407..1f1e62095464 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -40,13 +40,13 @@
#include <net/netfilter/br_netfilter.h>
#include <net/netns/generic.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "br_private.h"
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
-static int brnf_net_id __read_mostly;
+static unsigned int brnf_net_id __read_mostly;
struct brnf_net {
bool enabled;
@@ -399,7 +399,7 @@ bridged_dnat:
br_nf_hook_thresh(NF_BR_PRE_ROUTING,
net, sk, skb, skb->dev,
NULL,
- br_nf_pre_routing_finish);
+ br_nf_pre_routing_finish_bridge);
return 0;
}
ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
@@ -521,21 +521,6 @@ static unsigned int br_nf_pre_routing(void *priv,
}
-/* PF_BRIDGE/LOCAL_IN ************************************************/
-/* The packet is locally destined, which requires a real
- * dst_entry, so detach the fake one. On the way up, the
- * packet would pass through PRE_ROUTING again (which already
- * took place when the packet entered the bridge), but we
- * register an IPv4 PRE_ROUTING 'sabotage' hook that will
- * prevent this from happening. */
-static unsigned int br_nf_local_in(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- br_drop_fake_rtable(skb);
- return NF_ACCEPT;
-}
-
/* PF_BRIDGE/FORWARD *************************************************/
static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
@@ -561,8 +546,8 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff
}
nf_bridge_push_encap_header(skb);
- NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, net, sk, skb,
- in, skb->dev, br_forward_finish, 1);
+ br_nf_hook_thresh(NF_BR_FORWARD, net, sk, skb, in, skb->dev,
+ br_forward_finish);
return 0;
}
@@ -721,18 +706,20 @@ static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb)
static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct nf_bridge_info *nf_bridge;
- unsigned int mtu_reserved;
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ unsigned int mtu, mtu_reserved;
mtu_reserved = nf_bridge_mtu_reduction(skb);
+ mtu = skb->dev->mtu;
+
+ if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu)
+ mtu = nf_bridge->frag_max_size;
- if (skb_is_gso(skb) || skb->len + mtu_reserved <= skb->dev->mtu) {
+ if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) {
nf_bridge_info_free(skb);
return br_dev_queue_push_xmit(net, sk, skb);
}
- nf_bridge = nf_bridge_info_get(skb);
-
/* This is wrong! We should preserve the original fragment
* boundaries by preserving frag_list rather than refragmenting.
*/
@@ -845,8 +832,10 @@ static unsigned int ip_sabotage_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- if (skb->nf_bridge && !skb->nf_bridge->in_prerouting)
- return NF_STOP;
+ if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) {
+ state->okfn(state->net, state->sk, skb);
+ return NF_STOLEN;
+ }
return NF_ACCEPT;
}
@@ -906,12 +895,6 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = {
.priority = NF_BR_PRI_BRNF,
},
{
- .hook = br_nf_local_in,
- .pf = NFPROTO_BRIDGE,
- .hooknum = NF_BR_LOCAL_IN,
- .priority = NF_BR_PRI_BRNF,
- },
- {
.hook = br_nf_forward_ip,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_FORWARD,
@@ -1006,20 +989,20 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net,
struct nf_hook_state state;
int ret;
- elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]);
-
- while (elem && (elem->ops.priority <= NF_BR_PRI_BRNF))
- elem = rcu_dereference(elem->next);
+ for (elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]);
+ elem && nf_hook_entry_priority(elem) <= NF_BR_PRI_BRNF;
+ elem = rcu_dereference(elem->next))
+ ;
if (!elem)
return okfn(net, sk, skb);
/* We may already have this, but read-locks nest anyway */
rcu_read_lock();
- nf_hook_state_init(&state, elem, hook, NF_BR_PRI_BRNF + 1,
- NFPROTO_BRIDGE, indev, outdev, sk, net, okfn);
+ nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev,
+ sk, net, okfn);
- ret = nf_hook_slow(skb, &state);
+ ret = nf_hook_slow(skb, &state, elem);
rcu_read_unlock();
if (ret == 1)
ret = okfn(net, sk, skb);
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 5989661c659f..96c072e71ea2 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -38,7 +38,7 @@
#include <net/route.h>
#include <net/netfilter/br_netfilter.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "br_private.h"
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index e99037c6f7b7..225ef7d53701 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -20,6 +20,7 @@
#include "br_private.h"
#include "br_private_stp.h"
+#include "br_private_tunnel.h"
static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg,
u32 filter_mask)
@@ -95,9 +96,10 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev,
u32 filter_mask)
{
struct net_bridge_vlan_group *vg = NULL;
- struct net_bridge_port *p;
+ struct net_bridge_port *p = NULL;
struct net_bridge *br;
int num_vlan_infos;
+ size_t vinfo_sz = 0;
rcu_read_lock();
if (br_port_exists(dev)) {
@@ -110,8 +112,13 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev,
num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask);
rcu_read_unlock();
+ if (p && (p->flags & BR_VLAN_TUNNEL))
+ vinfo_sz += br_get_vlan_tunnel_info_size(vg);
+
/* Each VLAN is returned in bridge_vlan_info along with flags */
- return num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info));
+ vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info));
+
+ return vinfo_sz;
}
static inline size_t br_port_info_size(void)
@@ -123,10 +130,12 @@ static inline size_t br_port_info_size(void)
+ nla_total_size(1) /* IFLA_BRPORT_GUARD */
+ nla_total_size(1) /* IFLA_BRPORT_PROTECT */
+ nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */
+ + nla_total_size(1) /* IFLA_BRPORT_MCAST_TO_UCAST */
+ nla_total_size(1) /* IFLA_BRPORT_LEARNING */
+ nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */
+ nla_total_size(1) /* IFLA_BRPORT_PROXYARP */
+ nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */
+ + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */
+ nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */
+ nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */
+ nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */
@@ -173,6 +182,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
!!(p->flags & BR_ROOT_BLOCK)) ||
nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE,
!!(p->flags & BR_MULTICAST_FAST_LEAVE)) ||
+ nla_put_u8(skb, IFLA_BRPORT_MCAST_TO_UCAST,
+ !!(p->flags & BR_MULTICAST_TO_UNICAST)) ||
nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) ||
nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD,
!!(p->flags & BR_FLOOD)) ||
@@ -191,7 +202,9 @@ static int br_port_fill_attrs(struct sk_buff *skb,
nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) ||
nla_put_u8(skb, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK,
p->topology_change_ack) ||
- nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending))
+ nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) ||
+ nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags &
+ BR_VLAN_TUNNEL)))
return -EMSGSIZE;
timerval = br_timer_value(&p->message_age_timer);
@@ -414,6 +427,9 @@ static int br_fill_ifinfo(struct sk_buff *skb,
err = br_fill_ifvlaninfo_compressed(skb, vg);
else
err = br_fill_ifvlaninfo(skb, vg);
+
+ if (port && (port->flags & BR_VLAN_TUNNEL))
+ err = br_fill_vlan_tunnel_info(skb, vg);
rcu_read_unlock();
if (err)
goto nla_put_failure;
@@ -514,60 +530,88 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p,
return err;
}
+static int br_process_vlan_info(struct net_bridge *br,
+ struct net_bridge_port *p, int cmd,
+ struct bridge_vlan_info *vinfo_curr,
+ struct bridge_vlan_info **vinfo_last)
+{
+ if (!vinfo_curr->vid || vinfo_curr->vid >= VLAN_VID_MASK)
+ return -EINVAL;
+
+ if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
+ /* check if we are already processing a range */
+ if (*vinfo_last)
+ return -EINVAL;
+ *vinfo_last = vinfo_curr;
+ /* don't allow range of pvids */
+ if ((*vinfo_last)->flags & BRIDGE_VLAN_INFO_PVID)
+ return -EINVAL;
+ return 0;
+ }
+
+ if (*vinfo_last) {
+ struct bridge_vlan_info tmp_vinfo;
+ int v, err;
+
+ if (!(vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END))
+ return -EINVAL;
+
+ if (vinfo_curr->vid <= (*vinfo_last)->vid)
+ return -EINVAL;
+
+ memcpy(&tmp_vinfo, *vinfo_last,
+ sizeof(struct bridge_vlan_info));
+ for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) {
+ tmp_vinfo.vid = v;
+ err = br_vlan_info(br, p, cmd, &tmp_vinfo);
+ if (err)
+ break;
+ }
+ *vinfo_last = NULL;
+
+ return 0;
+ }
+
+ return br_vlan_info(br, p, cmd, vinfo_curr);
+}
+
static int br_afspec(struct net_bridge *br,
struct net_bridge_port *p,
struct nlattr *af_spec,
int cmd)
{
- struct bridge_vlan_info *vinfo_start = NULL;
- struct bridge_vlan_info *vinfo = NULL;
+ struct bridge_vlan_info *vinfo_curr = NULL;
+ struct bridge_vlan_info *vinfo_last = NULL;
struct nlattr *attr;
- int err = 0;
- int rem;
+ struct vtunnel_info tinfo_last = {};
+ struct vtunnel_info tinfo_curr = {};
+ int err = 0, rem;
nla_for_each_nested(attr, af_spec, rem) {
- if (nla_type(attr) != IFLA_BRIDGE_VLAN_INFO)
- continue;
- if (nla_len(attr) != sizeof(struct bridge_vlan_info))
- return -EINVAL;
- vinfo = nla_data(attr);
- if (!vinfo->vid || vinfo->vid >= VLAN_VID_MASK)
- return -EINVAL;
- if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
- if (vinfo_start)
- return -EINVAL;
- vinfo_start = vinfo;
- /* don't allow range of pvids */
- if (vinfo_start->flags & BRIDGE_VLAN_INFO_PVID)
+ err = 0;
+ switch (nla_type(attr)) {
+ case IFLA_BRIDGE_VLAN_TUNNEL_INFO:
+ if (!(p->flags & BR_VLAN_TUNNEL))
return -EINVAL;
- continue;
- }
-
- if (vinfo_start) {
- struct bridge_vlan_info tmp_vinfo;
- int v;
-
- if (!(vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END))
- return -EINVAL;
-
- if (vinfo->vid <= vinfo_start->vid)
+ err = br_parse_vlan_tunnel_info(attr, &tinfo_curr);
+ if (err)
+ return err;
+ err = br_process_vlan_tunnel_info(br, p, cmd,
+ &tinfo_curr,
+ &tinfo_last);
+ if (err)
+ return err;
+ break;
+ case IFLA_BRIDGE_VLAN_INFO:
+ if (nla_len(attr) != sizeof(struct bridge_vlan_info))
return -EINVAL;
-
- memcpy(&tmp_vinfo, vinfo_start,
- sizeof(struct bridge_vlan_info));
-
- for (v = vinfo_start->vid; v <= vinfo->vid; v++) {
- tmp_vinfo.vid = v;
- err = br_vlan_info(br, p, cmd, &tmp_vinfo);
- if (err)
- break;
- }
- vinfo_start = NULL;
- } else {
- err = br_vlan_info(br, p, cmd, vinfo);
- }
- if (err)
+ vinfo_curr = nla_data(attr);
+ err = br_process_vlan_info(br, p, cmd, vinfo_curr,
+ &vinfo_last);
+ if (err)
+ return err;
break;
+ }
}
return err;
@@ -586,6 +630,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
[IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 },
[IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 },
[IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 },
+ [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 },
};
/* Change the state of the port and notify spanning tree */
@@ -626,8 +671,9 @@ static void br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[],
/* Process bridge protocol info on port */
static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
{
- int err;
unsigned long old_flags = p->flags;
+ bool br_vlan_tunnel_old = false;
+ int err;
br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE);
br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD);
@@ -636,9 +682,15 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING);
br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD);
br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD);
+ br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST, BR_MULTICAST_TO_UNICAST);
br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP);
br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI);
+ br_vlan_tunnel_old = (p->flags & BR_VLAN_TUNNEL) ? true : false;
+ br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL);
+ if (br_vlan_tunnel_old && !(p->flags & BR_VLAN_TUNNEL))
+ nbp_vlan_tunnel_info_flush(p);
+
if (tb[IFLA_BRPORT_COST]) {
err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST]));
if (err)
@@ -781,20 +833,6 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[])
return 0;
}
-static int br_dev_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[])
-{
- struct net_bridge *br = netdev_priv(dev);
-
- if (tb[IFLA_ADDRESS]) {
- spin_lock_bh(&br->lock);
- br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS]));
- spin_unlock_bh(&br->lock);
- }
-
- return register_netdevice(dev);
-}
-
static int br_port_slave_changelink(struct net_device *brdev,
struct net_device *dev,
struct nlattr *tb[],
@@ -858,6 +896,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
[IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 },
[IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 },
[IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 },
};
static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -1069,6 +1109,26 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]);
br->multicast_stats_enabled = !!mcast_stats;
}
+
+ if (data[IFLA_BR_MCAST_IGMP_VERSION]) {
+ __u8 igmp_version;
+
+ igmp_version = nla_get_u8(data[IFLA_BR_MCAST_IGMP_VERSION]);
+ err = br_multicast_set_igmp_version(br, igmp_version);
+ if (err)
+ return err;
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (data[IFLA_BR_MCAST_MLD_VERSION]) {
+ __u8 mld_version;
+
+ mld_version = nla_get_u8(data[IFLA_BR_MCAST_MLD_VERSION]);
+ err = br_multicast_set_mld_version(br, mld_version);
+ if (err)
+ return err;
+ }
+#endif
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
if (data[IFLA_BR_NF_CALL_IPTABLES]) {
@@ -1093,6 +1153,28 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
return 0;
}
+static int br_dev_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct net_bridge *br = netdev_priv(dev);
+ int err;
+
+ if (tb[IFLA_ADDRESS]) {
+ spin_lock_bh(&br->lock);
+ br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS]));
+ spin_unlock_bh(&br->lock);
+ }
+
+ err = register_netdevice(dev);
+ if (err)
+ return err;
+
+ err = br_changelink(dev, tb, data);
+ if (err)
+ unregister_netdevice(dev);
+ return err;
+}
+
static size_t br_get_size(const struct net_device *brdev)
{
return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */
@@ -1135,6 +1217,8 @@ static size_t br_get_size(const struct net_device *brdev)
nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */
nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */
nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_IGMP_VERSION */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_MLD_VERSION */
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */
@@ -1166,7 +1250,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval,
IFLA_BR_PAD))
return -EMSGSIZE;
- clockval = br_timer_value(&br->gc_timer);
+ clockval = br_timer_value(&br->gc_work.timer);
if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD))
return -EMSGSIZE;
@@ -1210,9 +1294,15 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT,
br->multicast_last_member_count) ||
nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT,
- br->multicast_startup_query_count))
+ br->multicast_startup_query_count) ||
+ nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION,
+ br->multicast_igmp_version))
return -EMSGSIZE;
-
+#if IS_ENABLED(CONFIG_IPV6)
+ if (nla_put_u8(skb, IFLA_BR_MCAST_MLD_VERSION,
+ br->multicast_mld_version))
+ return -EMSGSIZE;
+#endif
clockval = jiffies_to_clock_t(br->multicast_last_member_interval);
if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval,
IFLA_BR_PAD))
diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c
new file mode 100644
index 000000000000..c913491495ab
--- /dev/null
+++ b/net/bridge/br_netlink_tunnel.c
@@ -0,0 +1,294 @@
+/*
+ * Bridge per vlan tunnel port dst_metadata netlink control interface
+ *
+ * Authors:
+ * Roopa Prabhu <roopa@cumulusnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/etherdevice.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <uapi/linux/if_bridge.h>
+#include <net/dst_metadata.h>
+
+#include "br_private.h"
+#include "br_private_tunnel.h"
+
+static size_t __get_vlan_tinfo_size(void)
+{
+ return nla_total_size(0) + /* nest IFLA_BRIDGE_VLAN_TUNNEL_INFO */
+ nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_VLAN_TUNNEL_ID */
+ nla_total_size(sizeof(u16)) + /* IFLA_BRIDGE_VLAN_TUNNEL_VID */
+ nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_VLAN_TUNNEL_FLAGS */
+}
+
+static bool vlan_tunid_inrange(struct net_bridge_vlan *v_curr,
+ struct net_bridge_vlan *v_last)
+{
+ __be32 tunid_curr = tunnel_id_to_key32(v_curr->tinfo.tunnel_id);
+ __be32 tunid_last = tunnel_id_to_key32(v_last->tinfo.tunnel_id);
+
+ return (be32_to_cpu(tunid_curr) - be32_to_cpu(tunid_last)) == 1;
+}
+
+static int __get_num_vlan_tunnel_infos(struct net_bridge_vlan_group *vg)
+{
+ struct net_bridge_vlan *v, *vtbegin = NULL, *vtend = NULL;
+ int num_tinfos = 0;
+
+ /* Count number of vlan infos */
+ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+ /* only a context, bridge vlan not activated */
+ if (!br_vlan_should_use(v) || !v->tinfo.tunnel_id)
+ continue;
+
+ if (!vtbegin) {
+ goto initvars;
+ } else if ((v->vid - vtend->vid) == 1 &&
+ vlan_tunid_inrange(v, vtend)) {
+ vtend = v;
+ continue;
+ } else {
+ if ((vtend->vid - vtbegin->vid) > 0)
+ num_tinfos += 2;
+ else
+ num_tinfos += 1;
+ }
+initvars:
+ vtbegin = v;
+ vtend = v;
+ }
+
+ if (vtbegin && vtend) {
+ if ((vtend->vid - vtbegin->vid) > 0)
+ num_tinfos += 2;
+ else
+ num_tinfos += 1;
+ }
+
+ return num_tinfos;
+}
+
+int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg)
+{
+ int num_tinfos;
+
+ if (!vg)
+ return 0;
+
+ rcu_read_lock();
+ num_tinfos = __get_num_vlan_tunnel_infos(vg);
+ rcu_read_unlock();
+
+ return num_tinfos * __get_vlan_tinfo_size();
+}
+
+static int br_fill_vlan_tinfo(struct sk_buff *skb, u16 vid,
+ __be64 tunnel_id, u16 flags)
+{
+ __be32 tid = tunnel_id_to_key32(tunnel_id);
+ struct nlattr *tmap;
+
+ tmap = nla_nest_start(skb, IFLA_BRIDGE_VLAN_TUNNEL_INFO);
+ if (!tmap)
+ return -EMSGSIZE;
+ if (nla_put_u32(skb, IFLA_BRIDGE_VLAN_TUNNEL_ID,
+ be32_to_cpu(tid)))
+ goto nla_put_failure;
+ if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_VID,
+ vid))
+ goto nla_put_failure;
+ if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_FLAGS,
+ flags))
+ goto nla_put_failure;
+ nla_nest_end(skb, tmap);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, tmap);
+
+ return -EMSGSIZE;
+}
+
+static int br_fill_vlan_tinfo_range(struct sk_buff *skb,
+ struct net_bridge_vlan *vtbegin,
+ struct net_bridge_vlan *vtend)
+{
+ int err;
+
+ if (vtend && (vtend->vid - vtbegin->vid) > 0) {
+ /* add range to skb */
+ err = br_fill_vlan_tinfo(skb, vtbegin->vid,
+ vtbegin->tinfo.tunnel_id,
+ BRIDGE_VLAN_INFO_RANGE_BEGIN);
+ if (err)
+ return err;
+
+ err = br_fill_vlan_tinfo(skb, vtend->vid,
+ vtend->tinfo.tunnel_id,
+ BRIDGE_VLAN_INFO_RANGE_END);
+ if (err)
+ return err;
+ } else {
+ err = br_fill_vlan_tinfo(skb, vtbegin->vid,
+ vtbegin->tinfo.tunnel_id,
+ 0);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+int br_fill_vlan_tunnel_info(struct sk_buff *skb,
+ struct net_bridge_vlan_group *vg)
+{
+ struct net_bridge_vlan *vtbegin = NULL;
+ struct net_bridge_vlan *vtend = NULL;
+ struct net_bridge_vlan *v;
+ int err;
+
+ /* Count number of vlan infos */
+ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+ /* only a context, bridge vlan not activated */
+ if (!br_vlan_should_use(v))
+ continue;
+
+ if (!v->tinfo.tunnel_dst)
+ continue;
+
+ if (!vtbegin) {
+ goto initvars;
+ } else if ((v->vid - vtend->vid) == 1 &&
+ vlan_tunid_inrange(v, vtend)) {
+ vtend = v;
+ continue;
+ } else {
+ err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend);
+ if (err)
+ return err;
+ }
+initvars:
+ vtbegin = v;
+ vtend = v;
+ }
+
+ if (vtbegin) {
+ err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1] = {
+ [IFLA_BRIDGE_VLAN_TUNNEL_ID] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_VLAN_TUNNEL_VID] = { .type = NLA_U16 },
+ [IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = { .type = NLA_U16 },
+};
+
+static int br_vlan_tunnel_info(struct net_bridge_port *p, int cmd,
+ u16 vid, u32 tun_id)
+{
+ int err = 0;
+
+ if (!p)
+ return -EINVAL;
+
+ switch (cmd) {
+ case RTM_SETLINK:
+ err = nbp_vlan_tunnel_info_add(p, vid, tun_id);
+ break;
+ case RTM_DELLINK:
+ nbp_vlan_tunnel_info_delete(p, vid);
+ break;
+ }
+
+ return err;
+}
+
+int br_parse_vlan_tunnel_info(struct nlattr *attr,
+ struct vtunnel_info *tinfo)
+{
+ struct nlattr *tb[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1];
+ u32 tun_id;
+ u16 vid, flags = 0;
+ int err;
+
+ memset(tinfo, 0, sizeof(*tinfo));
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_VLAN_TUNNEL_MAX,
+ attr, vlan_tunnel_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_VLAN_TUNNEL_ID] ||
+ !tb[IFLA_BRIDGE_VLAN_TUNNEL_VID])
+ return -EINVAL;
+
+ tun_id = nla_get_u32(tb[IFLA_BRIDGE_VLAN_TUNNEL_ID]);
+ vid = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_VID]);
+ if (vid >= VLAN_VID_MASK)
+ return -ERANGE;
+
+ if (tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS])
+ flags = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS]);
+
+ tinfo->tunid = tun_id;
+ tinfo->vid = vid;
+ tinfo->flags = flags;
+
+ return 0;
+}
+
+int br_process_vlan_tunnel_info(struct net_bridge *br,
+ struct net_bridge_port *p, int cmd,
+ struct vtunnel_info *tinfo_curr,
+ struct vtunnel_info *tinfo_last)
+{
+ int err;
+
+ if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
+ if (tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN)
+ return -EINVAL;
+ memcpy(tinfo_last, tinfo_curr, sizeof(struct vtunnel_info));
+ } else if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END) {
+ int t, v;
+
+ if (!(tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN))
+ return -EINVAL;
+ if ((tinfo_curr->vid - tinfo_last->vid) !=
+ (tinfo_curr->tunid - tinfo_last->tunid))
+ return -EINVAL;
+ t = tinfo_last->tunid;
+ for (v = tinfo_last->vid; v <= tinfo_curr->vid; v++) {
+ err = br_vlan_tunnel_info(p, cmd, v, t);
+ if (err)
+ return err;
+ t++;
+ }
+ memset(tinfo_last, 0, sizeof(struct vtunnel_info));
+ memset(tinfo_curr, 0, sizeof(struct vtunnel_info));
+ } else {
+ if (tinfo_last->flags)
+ return -EINVAL;
+ err = br_vlan_tunnel_info(p, cmd, tinfo_curr->vid,
+ tinfo_curr->tunid);
+ if (err)
+ return err;
+ memset(tinfo_last, 0, sizeof(struct vtunnel_info));
+ memset(tinfo_curr, 0, sizeof(struct vtunnel_info));
+ }
+
+ return 0;
+}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1b63177e0ccd..0d177280aa84 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -91,6 +91,11 @@ struct br_vlan_stats {
struct u64_stats_sync syncp;
};
+struct br_tunnel_info {
+ __be64 tunnel_id;
+ struct metadata_dst *tunnel_dst;
+};
+
/**
* struct net_bridge_vlan - per-vlan entry
*
@@ -113,6 +118,7 @@ struct br_vlan_stats {
*/
struct net_bridge_vlan {
struct rhash_head vnode;
+ struct rhash_head tnode;
u16 vid;
u16 flags;
struct br_vlan_stats __percpu *stats;
@@ -124,6 +130,9 @@ struct net_bridge_vlan {
atomic_t refcnt;
struct net_bridge_vlan *brvlan;
};
+
+ struct br_tunnel_info tinfo;
+
struct list_head vlist;
struct rcu_head rcu;
@@ -145,24 +154,27 @@ struct net_bridge_vlan {
*/
struct net_bridge_vlan_group {
struct rhashtable vlan_hash;
+ struct rhashtable tunnel_hash;
struct list_head vlan_list;
u16 num_vlans;
u16 pvid;
};
-struct net_bridge_fdb_entry
-{
+struct net_bridge_fdb_entry {
struct hlist_node hlist;
struct net_bridge_port *dst;
- unsigned long updated;
- unsigned long used;
mac_addr addr;
__u16 vlan_id;
unsigned char is_local:1,
is_static:1,
added_by_user:1,
added_by_external_learn:1;
+
+ /* write-heavy members should not affect lookups */
+ unsigned long updated ____cacheline_aligned_in_smp;
+ unsigned long used;
+
struct rcu_head rcu;
};
@@ -177,6 +189,7 @@ struct net_bridge_port_group {
struct timer_list timer;
struct br_ip addr;
unsigned char flags;
+ unsigned char eth_addr[ETH_ALEN];
};
struct net_bridge_mdb_entry
@@ -201,12 +214,16 @@ struct net_bridge_mdb_htable
u32 ver;
};
-struct net_bridge_port
-{
+struct net_bridge_port {
struct net_bridge *br;
struct net_device *dev;
struct list_head list;
+ unsigned long flags;
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ struct net_bridge_vlan_group __rcu *vlgrp;
+#endif
+
/* STP */
u8 priority;
u8 state;
@@ -227,8 +244,6 @@ struct net_bridge_port
struct kobject kobj;
struct rcu_head rcu;
- unsigned long flags;
-
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
struct bridge_mcast_own_query ip4_own_query;
#if IS_ENABLED(CONFIG_IPV6)
@@ -248,9 +263,6 @@ struct net_bridge_port
#ifdef CONFIG_NET_POLL_CONTROLLER
struct netpoll *np;
#endif
-#ifdef CONFIG_BRIDGE_VLAN_FILTERING
- struct net_bridge_vlan_group __rcu *vlgrp;
-#endif
#ifdef CONFIG_NET_SWITCHDEV
int offload_fwd_mark;
#endif
@@ -272,14 +284,21 @@ static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device *
rtnl_dereference(dev->rx_handler_data) : NULL;
}
-struct net_bridge
-{
+struct net_bridge {
spinlock_t lock;
+ spinlock_t hash_lock;
struct list_head port_list;
struct net_device *dev;
-
struct pcpu_sw_netstats __percpu *stats;
- spinlock_t hash_lock;
+ /* These fields are accessed on each packet */
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ u8 vlan_enabled;
+ u8 vlan_stats_enabled;
+ __be16 vlan_proto;
+ u16 default_pvid;
+ struct net_bridge_vlan_group __rcu *vlgrp;
+#endif
+
struct hlist_head hash[BR_HASH_SIZE];
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
union {
@@ -297,17 +316,20 @@ struct net_bridge
bridge_id designated_root;
bridge_id bridge_id;
u32 root_path_cost;
+ unsigned char topology_change;
+ unsigned char topology_change_detected;
+ u16 root_port;
unsigned long max_age;
unsigned long hello_time;
unsigned long forward_delay;
- unsigned long bridge_max_age;
unsigned long ageing_time;
+ unsigned long bridge_max_age;
unsigned long bridge_hello_time;
unsigned long bridge_forward_delay;
+ unsigned long bridge_ageing_time;
u8 group_addr[ETH_ALEN];
bool group_addr_set;
- u16 root_port;
enum {
BR_NO_STP, /* no spanning tree */
@@ -315,9 +337,6 @@ struct net_bridge
BR_USER_STP, /* new RSTP in userspace */
} stp_enabled;
- unsigned char topology_change;
- unsigned char topology_change_detected;
-
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
unsigned char multicast_router;
@@ -333,6 +352,8 @@ struct net_bridge
u32 multicast_last_member_count;
u32 multicast_startup_query_count;
+ u8 multicast_igmp_version;
+
unsigned long multicast_last_member_interval;
unsigned long multicast_membership_interval;
unsigned long multicast_querier_interval;
@@ -353,27 +374,20 @@ struct net_bridge
struct bridge_mcast_other_query ip6_other_query;
struct bridge_mcast_own_query ip6_own_query;
struct bridge_mcast_querier ip6_querier;
+ u8 multicast_mld_version;
#endif /* IS_ENABLED(CONFIG_IPV6) */
#endif
struct timer_list hello_timer;
struct timer_list tcn_timer;
struct timer_list topology_change_timer;
- struct timer_list gc_timer;
+ struct delayed_work gc_work;
struct kobject *ifobj;
u32 auto_cnt;
#ifdef CONFIG_NET_SWITCHDEV
int offload_fwd_mark;
#endif
-
-#ifdef CONFIG_BRIDGE_VLAN_FILTERING
- struct net_bridge_vlan_group __rcu *vlgrp;
- u8 vlan_enabled;
- u8 vlan_stats_enabled;
- __be16 vlan_proto;
- u16 default_pvid;
-#endif
};
struct br_input_skb_cb {
@@ -490,11 +504,12 @@ void br_fdb_find_delete_local(struct net_bridge *br,
const unsigned char *addr, u16 vid);
void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr);
void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr);
-void br_fdb_cleanup(unsigned long arg);
+void br_fdb_cleanup(struct work_struct *work);
void br_fdb_delete_by_port(struct net_bridge *br,
const struct net_bridge_port *p, u16 vid, int do_all);
-struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
- const unsigned char *addr, __u16 vid);
+struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
+ const unsigned char *addr,
+ __u16 vid);
int br_fdb_test_addr(struct net_device *dev, unsigned char *addr);
int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count,
unsigned long off);
@@ -582,6 +597,10 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val);
int br_multicast_toggle(struct net_bridge *br, unsigned long val);
int br_multicast_set_querier(struct net_bridge *br, unsigned long val);
int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val);
+int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val);
+#if IS_ENABLED(CONFIG_IPV6)
+int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val);
+#endif
struct net_bridge_mdb_entry *
br_mdb_ip_get(struct net_bridge_mdb_htable *mdb, struct br_ip *dst);
struct net_bridge_mdb_entry *
@@ -591,7 +610,7 @@ void br_multicast_free_pg(struct rcu_head *head);
struct net_bridge_port_group *
br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
struct net_bridge_port_group __rcu *next,
- unsigned char flags);
+ unsigned char flags, const unsigned char *src);
void br_mdb_init(void);
void br_mdb_uninit(void);
void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
@@ -601,6 +620,7 @@ void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p,
const struct sk_buff *skb, u8 type, u8 dir);
int br_multicast_init_stats(struct net_bridge *br);
+void br_multicast_uninit_stats(struct net_bridge *br);
void br_multicast_get_stats(const struct net_bridge *br,
const struct net_bridge_port *p,
struct br_mcast_stats *dest);
@@ -741,6 +761,10 @@ static inline int br_multicast_init_stats(struct net_bridge *br)
return 0;
}
+static inline void br_multicast_uninit_stats(struct net_bridge *br)
+{
+}
+
static inline int br_multicast_igmp_type(const struct sk_buff *skb)
{
return 0;
@@ -756,6 +780,7 @@ bool br_allowed_egress(struct net_bridge_vlan_group *vg,
const struct sk_buff *skb);
bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid);
struct sk_buff *br_handle_vlan(struct net_bridge *br,
+ const struct net_bridge_port *port,
struct net_bridge_vlan_group *vg,
struct sk_buff *skb);
int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags);
@@ -855,6 +880,7 @@ static inline bool br_should_learn(struct net_bridge_port *p,
}
static inline struct sk_buff *br_handle_vlan(struct net_bridge *br,
+ const struct net_bridge_port *port,
struct net_bridge_vlan_group *vg,
struct sk_buff *skb)
{
@@ -992,6 +1018,7 @@ void __br_set_forward_delay(struct net_bridge *br, unsigned long t);
int br_set_forward_delay(struct net_bridge *br, unsigned long x);
int br_set_hello_time(struct net_bridge *br, unsigned long x);
int br_set_max_age(struct net_bridge *br, unsigned long x);
+int __set_ageing_time(struct net_device *dev, unsigned long t);
int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time);
diff --git a/net/bridge/br_private_stp.h b/net/bridge/br_private_stp.h
index 2fe910c4e170..3f7543a29b76 100644
--- a/net/bridge/br_private_stp.h
+++ b/net/bridge/br_private_stp.h
@@ -61,6 +61,7 @@ void br_received_tcn_bpdu(struct net_bridge_port *p);
void br_transmit_config(struct net_bridge_port *p);
void br_transmit_tcn(struct net_bridge *br);
void br_topology_change_detection(struct net_bridge *br);
+void __br_set_topology_change(struct net_bridge *br, unsigned char val);
/* br_stp_bpdu.c */
void br_send_config_bpdu(struct net_bridge_port *, struct br_config_bpdu *);
diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h
new file mode 100644
index 000000000000..4a447a378ab3
--- /dev/null
+++ b/net/bridge/br_private_tunnel.h
@@ -0,0 +1,83 @@
+/*
+ * Bridge per vlan tunnels
+ *
+ * Authors:
+ * Roopa Prabhu <roopa@cumulusnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _BR_PRIVATE_TUNNEL_H
+#define _BR_PRIVATE_TUNNEL_H
+
+struct vtunnel_info {
+ u32 tunid;
+ u16 vid;
+ u16 flags;
+};
+
+/* br_netlink_tunnel.c */
+int br_parse_vlan_tunnel_info(struct nlattr *attr,
+ struct vtunnel_info *tinfo);
+int br_process_vlan_tunnel_info(struct net_bridge *br,
+ struct net_bridge_port *p,
+ int cmd,
+ struct vtunnel_info *tinfo_curr,
+ struct vtunnel_info *tinfo_last);
+int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg);
+int br_fill_vlan_tunnel_info(struct sk_buff *skb,
+ struct net_bridge_vlan_group *vg);
+
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+/* br_vlan_tunnel.c */
+int vlan_tunnel_init(struct net_bridge_vlan_group *vg);
+void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg);
+int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid);
+int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id);
+void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port);
+void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *vlan);
+int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
+ struct net_bridge_port *p,
+ struct net_bridge_vlan_group *vg);
+int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
+ struct net_bridge_vlan *vlan);
+#else
+static inline int vlan_tunnel_init(struct net_bridge_vlan_group *vg)
+{
+ return 0;
+}
+
+static inline int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port,
+ u16 vid)
+{
+ return 0;
+}
+
+static inline int nbp_vlan_tunnel_info_add(struct net_bridge_port *port,
+ u16 vid, u32 tun_id)
+{
+ return 0;
+}
+
+static inline void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port)
+{
+}
+
+static inline void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *vlan)
+{
+}
+
+static inline int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
+ struct net_bridge_port *p,
+ struct net_bridge_vlan_group *vg)
+{
+ return 0;
+}
+#endif
+
+#endif
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 9258b8ef14ff..8f56c2d1f1a7 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -234,7 +234,7 @@ static void br_record_config_timeout_values(struct net_bridge *br,
br->max_age = bpdu->max_age;
br->hello_time = bpdu->hello_time;
br->forward_delay = bpdu->forward_delay;
- br->topology_change = bpdu->topology_change;
+ __br_set_topology_change(br, bpdu->topology_change);
}
/* called under bridge lock */
@@ -344,7 +344,7 @@ void br_topology_change_detection(struct net_bridge *br)
isroot ? "propagating" : "sending tcn bpdu");
if (isroot) {
- br->topology_change = 1;
+ __br_set_topology_change(br, 1);
mod_timer(&br->topology_change_timer, jiffies
+ br->bridge_forward_delay + br->bridge_max_age);
} else if (!br->topology_change_detected) {
@@ -562,6 +562,24 @@ int br_set_max_age(struct net_bridge *br, unsigned long val)
}
+/* called under bridge lock */
+int __set_ageing_time(struct net_device *dev, unsigned long t)
+{
+ struct switchdev_attr attr = {
+ .orig_dev = dev,
+ .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
+ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER,
+ .u.ageing_time = jiffies_to_clock_t(t),
+ };
+ int err;
+
+ err = switchdev_port_attr_set(dev, &attr);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ return 0;
+}
+
/* Set time interval that dynamic forwarding entries live
* For pure software bridge, allow values outside the 802.1
* standard specification for special cases:
@@ -572,25 +590,52 @@ int br_set_max_age(struct net_bridge *br, unsigned long val)
*/
int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
{
- struct switchdev_attr attr = {
- .orig_dev = br->dev,
- .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
- .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
- .u.ageing_time = ageing_time,
- };
unsigned long t = clock_t_to_jiffies(ageing_time);
int err;
- err = switchdev_port_attr_set(br->dev, &attr);
- if (err && err != -EOPNOTSUPP)
+ err = __set_ageing_time(br->dev, t);
+ if (err)
return err;
+ spin_lock_bh(&br->lock);
+ br->bridge_ageing_time = t;
br->ageing_time = t;
- mod_timer(&br->gc_timer, jiffies);
+ spin_unlock_bh(&br->lock);
+
+ mod_delayed_work(system_long_wq, &br->gc_work, 0);
return 0;
}
+/* called under bridge lock */
+void __br_set_topology_change(struct net_bridge *br, unsigned char val)
+{
+ unsigned long t;
+ int err;
+
+ if (br->stp_enabled == BR_KERNEL_STP && br->topology_change != val) {
+ /* On topology change, set the bridge ageing time to twice the
+ * forward delay. Otherwise, restore its default ageing time.
+ */
+
+ if (val) {
+ t = 2 * br->forward_delay;
+ br_debug(br, "decreasing ageing time to %lu\n", t);
+ } else {
+ t = br->bridge_ageing_time;
+ br_debug(br, "restoring ageing time to %lu\n", t);
+ }
+
+ err = __set_ageing_time(br->dev, t);
+ if (err)
+ br_warn(br, "error offloading ageing time\n");
+ else
+ br->ageing_time = t;
+ }
+
+ br->topology_change = val;
+}
+
void __br_set_forward_delay(struct net_bridge *br, unsigned long t)
{
br->bridge_forward_delay = t;
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index d8ad73b38de2..08341d2aa9c9 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -36,12 +36,6 @@ static inline port_id br_make_port_id(__u8 priority, __u16 port_no)
/* called under bridge lock */
void br_init_port(struct net_bridge_port *p)
{
- struct switchdev_attr attr = {
- .orig_dev = p->dev,
- .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
- .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER,
- .u.ageing_time = jiffies_to_clock_t(p->br->ageing_time),
- };
int err;
p->port_id = br_make_port_id(p->priority, p->port_no);
@@ -50,9 +44,9 @@ void br_init_port(struct net_bridge_port *p)
p->topology_change_ack = 0;
p->config_pending = 0;
- err = switchdev_port_attr_set(p->dev, &attr);
- if (err && err != -EOPNOTSUPP)
- netdev_err(p->dev, "failed to set HW ageing time\n");
+ err = __set_ageing_time(p->dev, p->br->ageing_time);
+ if (err)
+ netdev_err(p->dev, "failed to offload ageing time\n");
}
/* NO locks held */
@@ -63,7 +57,7 @@ void br_stp_enable_bridge(struct net_bridge *br)
spin_lock_bh(&br->lock);
if (br->stp_enabled == BR_KERNEL_STP)
mod_timer(&br->hello_timer, jiffies + br->hello_time);
- mod_timer(&br->gc_timer, jiffies + HZ/10);
+ mod_delayed_work(system_long_wq, &br->gc_work, HZ / 10);
br_config_bpdu_generation(br);
@@ -87,14 +81,14 @@ void br_stp_disable_bridge(struct net_bridge *br)
}
- br->topology_change = 0;
+ __br_set_topology_change(br, 0);
br->topology_change_detected = 0;
spin_unlock_bh(&br->lock);
del_timer_sync(&br->hello_timer);
del_timer_sync(&br->topology_change_timer);
del_timer_sync(&br->tcn_timer);
- del_timer_sync(&br->gc_timer);
+ cancel_delayed_work_sync(&br->gc_work);
}
/* called under bridge lock */
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index da058b85aa22..c98b3e5c140a 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -125,7 +125,7 @@ static void br_topology_change_timer_expired(unsigned long arg)
br_debug(br, "topo change timer expired\n");
spin_lock(&br->lock);
br->topology_change_detected = 0;
- br->topology_change = 0;
+ __br_set_topology_change(br, 0);
spin_unlock(&br->lock);
}
@@ -153,8 +153,6 @@ void br_stp_timer_init(struct net_bridge *br)
setup_timer(&br->topology_change_timer,
br_topology_change_timer_expired,
(unsigned long) br);
-
- setup_timer(&br->gc_timer, br_fdb_cleanup, (unsigned long) br);
}
void br_stp_port_timer_init(struct net_bridge_port *p)
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index f88c4df3f91e..0b5dd607444c 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -19,6 +19,7 @@
#include <linux/rtnetlink.h>
#include <linux/spinlock.h>
#include <linux/times.h>
+#include <linux/sched/signal.h>
#include "br_private.h"
@@ -263,7 +264,7 @@ static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%ld\n", br_timer_value(&br->gc_timer));
+ return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer));
}
static DEVICE_ATTR_RO(gc_timer);
@@ -440,6 +441,23 @@ static ssize_t hash_max_store(struct device *d, struct device_attribute *attr,
}
static DEVICE_ATTR_RW(hash_max);
+static ssize_t multicast_igmp_version_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+
+ return sprintf(buf, "%u\n", br->multicast_igmp_version);
+}
+
+static ssize_t multicast_igmp_version_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_multicast_set_igmp_version);
+}
+static DEVICE_ATTR_RW(multicast_igmp_version);
+
static ssize_t multicast_last_member_count_show(struct device *d,
struct device_attribute *attr,
char *buf)
@@ -642,6 +660,25 @@ static ssize_t multicast_stats_enabled_store(struct device *d,
return store_bridge_parm(d, buf, len, set_stats_enabled);
}
static DEVICE_ATTR_RW(multicast_stats_enabled);
+
+#if IS_ENABLED(CONFIG_IPV6)
+static ssize_t multicast_mld_version_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+
+ return sprintf(buf, "%u\n", br->multicast_mld_version);
+}
+
+static ssize_t multicast_mld_version_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_multicast_set_mld_version);
+}
+static DEVICE_ATTR_RW(multicast_mld_version);
+#endif
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
static ssize_t nf_call_iptables_show(
@@ -809,6 +846,10 @@ static struct attribute *bridge_attrs[] = {
&dev_attr_multicast_query_response_interval.attr,
&dev_attr_multicast_startup_query_interval.attr,
&dev_attr_multicast_stats_enabled.attr,
+ &dev_attr_multicast_igmp_version.attr,
+#if IS_ENABLED(CONFIG_IPV6)
+ &dev_attr_multicast_mld_version.attr,
+#endif
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
&dev_attr_nf_call_iptables.attr,
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 8bd569695e76..79aee759aba5 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -17,6 +17,7 @@
#include <linux/if_bridge.h>
#include <linux/rtnetlink.h>
#include <linux/spinlock.h>
+#include <linux/sched/signal.h>
#include "br_private.h"
@@ -188,6 +189,7 @@ static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router,
store_multicast_router);
BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE);
+BRPORT_ATTR_FLAG(multicast_to_unicast, BR_MULTICAST_TO_UNICAST);
#endif
static const struct brport_attribute *brport_attrs[] = {
@@ -214,6 +216,7 @@ static const struct brport_attribute *brport_attrs[] = {
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
&brport_attr_multicast_router,
&brport_attr_multicast_fast_leave,
+ &brport_attr_multicast_to_unicast,
#endif
&brport_attr_proxyarp,
&brport_attr_proxyarp_wifi,
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index b6de4f457161..b838213c408e 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -5,6 +5,7 @@
#include <net/switchdev.h>
#include "br_private.h"
+#include "br_private_tunnel.h"
static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg,
const void *ptr)
@@ -310,6 +311,7 @@ static int __vlan_del(struct net_bridge_vlan *v)
}
if (masterv != v) {
+ vlan_tunnel_info_del(vg, v);
rhashtable_remove_fast(&vg->vlan_hash, &v->vnode,
br_vlan_rht_params);
__vlan_del_list(v);
@@ -325,6 +327,7 @@ static void __vlan_group_free(struct net_bridge_vlan_group *vg)
{
WARN_ON(!list_empty(&vg->vlan_list));
rhashtable_destroy(&vg->vlan_hash);
+ vlan_tunnel_deinit(vg);
kfree(vg);
}
@@ -338,6 +341,7 @@ static void __vlan_flush(struct net_bridge_vlan_group *vg)
}
struct sk_buff *br_handle_vlan(struct net_bridge *br,
+ const struct net_bridge_port *p,
struct net_bridge_vlan_group *vg,
struct sk_buff *skb)
{
@@ -378,6 +382,12 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
skb->vlan_tci = 0;
+
+ if (p && (p->flags & BR_VLAN_TUNNEL) &&
+ br_handle_egress_vlan_tunnel(skb, v)) {
+ kfree_skb(skb);
+ return NULL;
+ }
out:
return skb;
}
@@ -613,6 +623,8 @@ int br_vlan_delete(struct net_bridge *br, u16 vid)
br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid);
br_fdb_delete_by_port(br, NULL, vid, 0);
+ vlan_tunnel_info_del(vg, v);
+
return __vlan_del(v);
}
@@ -918,6 +930,9 @@ int br_vlan_init(struct net_bridge *br)
ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params);
if (ret)
goto err_rhtbl;
+ ret = vlan_tunnel_init(vg);
+ if (ret)
+ goto err_tunnel_init;
INIT_LIST_HEAD(&vg->vlan_list);
br->vlan_proto = htons(ETH_P_8021Q);
br->default_pvid = 1;
@@ -932,6 +947,8 @@ out:
return ret;
err_vlan_add:
+ vlan_tunnel_deinit(vg);
+err_tunnel_init:
rhashtable_destroy(&vg->vlan_hash);
err_rhtbl:
kfree(vg);
@@ -961,6 +978,9 @@ int nbp_vlan_init(struct net_bridge_port *p)
ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params);
if (ret)
goto err_rhtbl;
+ ret = vlan_tunnel_init(vg);
+ if (ret)
+ goto err_tunnel_init;
INIT_LIST_HEAD(&vg->vlan_list);
rcu_assign_pointer(p->vlgrp, vg);
if (p->br->default_pvid) {
@@ -976,9 +996,11 @@ out:
err_vlan_add:
RCU_INIT_POINTER(p->vlgrp, NULL);
synchronize_rcu();
+ vlan_tunnel_deinit(vg);
+err_tunnel_init:
rhashtable_destroy(&vg->vlan_hash);
-err_vlan_enabled:
err_rhtbl:
+err_vlan_enabled:
kfree(vg);
goto out;
diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c
new file mode 100644
index 000000000000..6d2c4eed2dc8
--- /dev/null
+++ b/net/bridge/br_vlan_tunnel.c
@@ -0,0 +1,205 @@
+/*
+ * Bridge per vlan tunnel port dst_metadata handling code
+ *
+ * Authors:
+ * Roopa Prabhu <roopa@cumulusnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <net/switchdev.h>
+#include <net/dst_metadata.h>
+
+#include "br_private.h"
+#include "br_private_tunnel.h"
+
+static inline int br_vlan_tunid_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct net_bridge_vlan *vle = ptr;
+ __be64 tunid = *(__be64 *)arg->key;
+
+ return vle->tinfo.tunnel_id != tunid;
+}
+
+static const struct rhashtable_params br_vlan_tunnel_rht_params = {
+ .head_offset = offsetof(struct net_bridge_vlan, tnode),
+ .key_offset = offsetof(struct net_bridge_vlan, tinfo.tunnel_id),
+ .key_len = sizeof(__be64),
+ .nelem_hint = 3,
+ .locks_mul = 1,
+ .obj_cmpfn = br_vlan_tunid_cmp,
+ .automatic_shrinking = true,
+};
+
+static struct net_bridge_vlan *br_vlan_tunnel_lookup(struct rhashtable *tbl,
+ u64 tunnel_id)
+{
+ return rhashtable_lookup_fast(tbl, &tunnel_id,
+ br_vlan_tunnel_rht_params);
+}
+
+void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *vlan)
+{
+ if (!vlan->tinfo.tunnel_dst)
+ return;
+ rhashtable_remove_fast(&vg->tunnel_hash, &vlan->tnode,
+ br_vlan_tunnel_rht_params);
+ vlan->tinfo.tunnel_id = 0;
+ dst_release(&vlan->tinfo.tunnel_dst->dst);
+ vlan->tinfo.tunnel_dst = NULL;
+}
+
+static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *vlan, u32 tun_id)
+{
+ struct metadata_dst *metadata = NULL;
+ __be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id));
+ int err;
+
+ if (vlan->tinfo.tunnel_dst)
+ return -EEXIST;
+
+ metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY,
+ key, 0);
+ if (!metadata)
+ return -EINVAL;
+
+ metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE;
+ vlan->tinfo.tunnel_dst = metadata;
+ vlan->tinfo.tunnel_id = key;
+
+ err = rhashtable_lookup_insert_fast(&vg->tunnel_hash, &vlan->tnode,
+ br_vlan_tunnel_rht_params);
+ if (err)
+ goto out;
+
+ return 0;
+out:
+ dst_release(&vlan->tinfo.tunnel_dst->dst);
+ vlan->tinfo.tunnel_dst = NULL;
+ vlan->tinfo.tunnel_id = 0;
+
+ return err;
+}
+
+/* Must be protected by RTNL.
+ * Must be called with vid in range from 1 to 4094 inclusive.
+ */
+int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *vlan;
+
+ ASSERT_RTNL();
+
+ vg = nbp_vlan_group(port);
+ vlan = br_vlan_find(vg, vid);
+ if (!vlan)
+ return -EINVAL;
+
+ return __vlan_tunnel_info_add(vg, vlan, tun_id);
+}
+
+/* Must be protected by RTNL.
+ * Must be called with vid in range from 1 to 4094 inclusive.
+ */
+int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+
+ ASSERT_RTNL();
+
+ vg = nbp_vlan_group(port);
+ v = br_vlan_find(vg, vid);
+ if (!v)
+ return -ENOENT;
+
+ vlan_tunnel_info_del(vg, v);
+
+ return 0;
+}
+
+static void __vlan_tunnel_info_flush(struct net_bridge_vlan_group *vg)
+{
+ struct net_bridge_vlan *vlan, *tmp;
+
+ list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist)
+ vlan_tunnel_info_del(vg, vlan);
+}
+
+void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port)
+{
+ struct net_bridge_vlan_group *vg;
+
+ ASSERT_RTNL();
+
+ vg = nbp_vlan_group(port);
+ __vlan_tunnel_info_flush(vg);
+}
+
+int vlan_tunnel_init(struct net_bridge_vlan_group *vg)
+{
+ return rhashtable_init(&vg->tunnel_hash, &br_vlan_tunnel_rht_params);
+}
+
+void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg)
+{
+ rhashtable_destroy(&vg->tunnel_hash);
+}
+
+int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
+ struct net_bridge_port *p,
+ struct net_bridge_vlan_group *vg)
+{
+ struct ip_tunnel_info *tinfo = skb_tunnel_info(skb);
+ struct net_bridge_vlan *vlan;
+
+ if (!vg || !tinfo)
+ return 0;
+
+ /* if already tagged, ignore */
+ if (skb_vlan_tagged(skb))
+ return 0;
+
+ /* lookup vid, given tunnel id */
+ vlan = br_vlan_tunnel_lookup(&vg->tunnel_hash, tinfo->key.tun_id);
+ if (!vlan)
+ return 0;
+
+ skb_dst_drop(skb);
+
+ __vlan_hwaccel_put_tag(skb, p->br->vlan_proto, vlan->vid);
+
+ return 0;
+}
+
+int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
+ struct net_bridge_vlan *vlan)
+{
+ int err;
+
+ if (!vlan || !vlan->tinfo.tunnel_id)
+ return 0;
+
+ if (unlikely(!skb_vlan_tag_present(skb)))
+ return 0;
+
+ skb_dst_drop(skb);
+ err = skb_vlan_pop(skb);
+ if (err)
+ return err;
+
+ skb_dst_set(skb, dst_clone(&vlan->tinfo.tunnel_dst->dst));
+
+ return 0;
+}
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index 9cebf47ac840..e7ef1a1ef3a6 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -22,6 +22,7 @@ config NFT_BRIDGE_REJECT
config NF_LOG_BRIDGE
tristate "Bridge packet logging"
+ select NF_LOG_COMMON
endif # NF_TABLES_BRIDGE
diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
index 9024283d2bca..279527f8b1fe 100644
--- a/net/bridge/netfilter/ebt_among.c
+++ b/net/bridge/netfilter/ebt_among.c
@@ -187,7 +187,7 @@ static int ebt_among_mt_check(const struct xt_mtchk_param *par)
expected_length += ebt_mac_wormhash_size(wh_src);
if (em->match_size != EBT_ALIGN(expected_length)) {
- pr_info("wrong size: %d against expected %d, rounded to %Zd\n",
+ pr_info("wrong size: %d against expected %d, rounded to %zd\n",
em->match_size, expected_length,
EBT_ALIGN(expected_length));
return -EINVAL;
diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index 070cf134a22f..5929309beaa1 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -51,7 +51,8 @@ ebt_arpreply_tg(struct sk_buff *skb, const struct xt_action_param *par)
if (diptr == NULL)
return EBT_DROP;
- arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr, (struct net_device *)par->in,
+ arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr,
+ (struct net_device *)xt_in(par),
*diptr, shp, info->mac, shp);
return info->target;
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
index 517e78befcb2..61a9f1be1263 100644
--- a/net/bridge/netfilter/ebt_limit.c
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -105,6 +105,7 @@ static struct xt_match ebt_limit_mt_reg __read_mostly = {
.match = ebt_limit_mt,
.checkentry = ebt_limit_mt_check,
.matchsize = sizeof(struct ebt_limit_info),
+ .usersize = offsetof(struct ebt_limit_info, prev),
#ifdef CONFIG_COMPAT
.compatsize = sizeof(struct ebt_compat_limit_info),
#endif
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 9a11086ba6ff..98b9c8e8615e 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -78,7 +78,7 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
unsigned int bitmask;
/* FIXME: Disabled from containers until syslog ns is supported */
- if (!net_eq(net, &init_net))
+ if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
return;
spin_lock_bh(&ebt_log_lock);
@@ -179,7 +179,7 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_log_info *info = par->targinfo;
struct nf_loginfo li;
- struct net *net = par->net;
+ struct net *net = xt_net(par);
li.type = NF_LOG_TYPE_LOG;
li.u.log.level = info->loglevel;
@@ -190,11 +190,12 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par)
* nf_log_packet() with NFT_LOG_TYPE_LOG here. --Pablo
*/
if (info->bitmask & EBT_LOG_NFLOG)
- nf_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb,
- par->in, par->out, &li, "%s", info->prefix);
+ nf_log_packet(net, NFPROTO_BRIDGE, xt_hooknum(par), skb,
+ xt_in(par), xt_out(par), &li, "%s",
+ info->prefix);
else
- ebt_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb, par->in,
- par->out, &li, info->prefix);
+ ebt_log_packet(net, NFPROTO_BRIDGE, xt_hooknum(par), skb,
+ xt_in(par), xt_out(par), &li, info->prefix);
return EBT_CONTINUE;
}
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
index 54816150608e..c1dc48686200 100644
--- a/net/bridge/netfilter/ebt_nflog.c
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -23,16 +23,16 @@ static unsigned int
ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_nflog_info *info = par->targinfo;
+ struct net *net = xt_net(par);
struct nf_loginfo li;
- struct net *net = par->net;
li.type = NF_LOG_TYPE_ULOG;
li.u.ulog.copy_len = info->len;
li.u.ulog.group = info->group;
li.u.ulog.qthreshold = info->threshold;
- nf_log_packet(net, PF_BRIDGE, par->hooknum, skb, par->in,
- par->out, &li, "%s", info->prefix);
+ nf_log_packet(net, PF_BRIDGE, xt_hooknum(par), skb, xt_in(par),
+ xt_out(par), &li, "%s", info->prefix);
return EBT_CONTINUE;
}
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 2e7c4f974340..8d2a85e0594e 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -23,12 +23,12 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
if (!skb_make_writable(skb, 0))
return EBT_DROP;
- if (par->hooknum != NF_BR_BROUTING)
+ if (xt_hooknum(par) != NF_BR_BROUTING)
/* rcu_read_lock()ed by nf_hook_thresh */
ether_addr_copy(eth_hdr(skb)->h_dest,
- br_port_get_rcu(par->in)->br->dev->dev_addr);
+ br_port_get_rcu(xt_in(par))->br->dev->dev_addr);
else
- ether_addr_copy(eth_hdr(skb)->h_dest, par->in->dev_addr);
+ ether_addr_copy(eth_hdr(skb)->h_dest, xt_in(par)->dev_addr);
skb->pkt_type = PACKET_HOST;
return info->target;
}
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index ec94c6f1ae88..8fe36dc3aab2 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -53,7 +53,7 @@ static int ebt_broute(struct sk_buff *skb)
struct nf_hook_state state;
int ret;
- nf_hook_state_init(&state, NULL, NF_BR_BROUTING, INT_MIN,
+ nf_hook_state_init(&state, NF_BR_BROUTING,
NFPROTO_BRIDGE, skb->dev, NULL, NULL,
dev_net(skb->dev), NULL);
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index f5c11bbe27db..79b69917f521 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -23,7 +23,7 @@
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/audit.h>
@@ -194,12 +194,8 @@ unsigned int ebt_do_table(struct sk_buff *skb,
const struct ebt_table_info *private;
struct xt_action_param acpar;
- acpar.family = NFPROTO_BRIDGE;
- acpar.net = state->net;
- acpar.in = state->in;
- acpar.out = state->out;
+ acpar.state = state;
acpar.hotdrop = false;
- acpar.hooknum = hook;
read_lock_bh(&table->lock);
private = table->private;
@@ -1350,56 +1346,72 @@ static int update_counters(struct net *net, const void __user *user,
hlp.num_counters, user, len);
}
-static inline int ebt_make_matchname(const struct ebt_entry_match *m,
- const char *base, char __user *ubase)
+static inline int ebt_obj_to_user(char __user *um, const char *_name,
+ const char *data, int entrysize,
+ int usersize, int datasize)
{
- char __user *hlp = ubase + ((char *)m - base);
- char name[EBT_FUNCTION_MAXNAMELEN] = {};
+ char name[EBT_FUNCTION_MAXNAMELEN] = {0};
/* ebtables expects 32 bytes long names but xt_match names are 29 bytes
* long. Copy 29 bytes and fill remaining bytes with zeroes.
*/
- strlcpy(name, m->u.match->name, sizeof(name));
- if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN))
+ strlcpy(name, _name, sizeof(name));
+ if (copy_to_user(um, name, EBT_FUNCTION_MAXNAMELEN) ||
+ put_user(datasize, (int __user *)(um + EBT_FUNCTION_MAXNAMELEN)) ||
+ xt_data_to_user(um + entrysize, data, usersize, datasize))
return -EFAULT;
+
return 0;
}
-static inline int ebt_make_watchername(const struct ebt_entry_watcher *w,
- const char *base, char __user *ubase)
+static inline int ebt_match_to_user(const struct ebt_entry_match *m,
+ const char *base, char __user *ubase)
{
- char __user *hlp = ubase + ((char *)w - base);
- char name[EBT_FUNCTION_MAXNAMELEN] = {};
+ return ebt_obj_to_user(ubase + ((char *)m - base),
+ m->u.match->name, m->data, sizeof(*m),
+ m->u.match->usersize, m->match_size);
+}
- strlcpy(name, w->u.watcher->name, sizeof(name));
- if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN))
- return -EFAULT;
- return 0;
+static inline int ebt_watcher_to_user(const struct ebt_entry_watcher *w,
+ const char *base, char __user *ubase)
+{
+ return ebt_obj_to_user(ubase + ((char *)w - base),
+ w->u.watcher->name, w->data, sizeof(*w),
+ w->u.watcher->usersize, w->watcher_size);
}
-static inline int ebt_make_names(struct ebt_entry *e, const char *base,
- char __user *ubase)
+static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base,
+ char __user *ubase)
{
int ret;
char __user *hlp;
const struct ebt_entry_target *t;
- char name[EBT_FUNCTION_MAXNAMELEN] = {};
- if (e->bitmask == 0)
+ if (e->bitmask == 0) {
+ /* special case !EBT_ENTRY_OR_ENTRIES */
+ if (copy_to_user(ubase + ((char *)e - base), e,
+ sizeof(struct ebt_entries)))
+ return -EFAULT;
return 0;
+ }
+
+ if (copy_to_user(ubase + ((char *)e - base), e, sizeof(*e)))
+ return -EFAULT;
hlp = ubase + (((char *)e + e->target_offset) - base);
t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
- ret = EBT_MATCH_ITERATE(e, ebt_make_matchname, base, ubase);
+ ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase);
if (ret != 0)
return ret;
- ret = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase);
+ ret = EBT_WATCHER_ITERATE(e, ebt_watcher_to_user, base, ubase);
if (ret != 0)
return ret;
- strlcpy(name, t->u.target->name, sizeof(name));
- if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN))
- return -EFAULT;
+ ret = ebt_obj_to_user(hlp, t->u.target->name, t->data, sizeof(*t),
+ t->u.target->usersize, t->target_size);
+ if (ret != 0)
+ return ret;
+
return 0;
}
@@ -1479,13 +1491,9 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user,
if (ret)
return ret;
- if (copy_to_user(tmp.entries, entries, entries_size)) {
- BUGPRINT("Couldn't copy entries to userspace\n");
- return -EFAULT;
- }
/* set the match/watcher/target names right */
return EBT_ENTRY_ITERATE(entries, entries_size,
- ebt_make_names, entries, tmp.entries);
+ ebt_entry_to_user, entries, tmp.entries);
}
static int do_ebt_set_ctl(struct sock *sk,
@@ -1634,8 +1642,10 @@ static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr,
if (match->compat_to_user) {
if (match->compat_to_user(cm->data, m->data))
return -EFAULT;
- } else if (copy_to_user(cm->data, m->data, msize))
+ } else {
+ if (xt_data_to_user(cm->data, m->data, match->usersize, msize))
return -EFAULT;
+ }
*size -= ebt_compat_entry_padsize() + off;
*dstptr = cm->data;
@@ -1661,8 +1671,10 @@ static int compat_target_to_user(struct ebt_entry_target *t,
if (target->compat_to_user) {
if (target->compat_to_user(cm->data, t->data))
return -EFAULT;
- } else if (copy_to_user(cm->data, t->data, tsize))
- return -EFAULT;
+ } else {
+ if (xt_data_to_user(cm->data, t->data, target->usersize, tsize))
+ return -EFAULT;
+ }
*size -= ebt_compat_entry_padsize() + off;
*dstptr = cm->data;
diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c
index 1663df598545..bd2b3c78f59b 100644
--- a/net/bridge/netfilter/nf_log_bridge.c
+++ b/net/bridge/netfilter/nf_log_bridge.c
@@ -24,21 +24,8 @@ static void nf_log_bridge_packet(struct net *net, u_int8_t pf,
const struct nf_loginfo *loginfo,
const char *prefix)
{
- switch (eth_hdr(skb)->h_proto) {
- case htons(ETH_P_IP):
- nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out,
- loginfo, "%s", prefix);
- break;
- case htons(ETH_P_IPV6):
- nf_log_packet(net, NFPROTO_IPV6, hooknum, skb, in, out,
- loginfo, "%s", prefix);
- break;
- case htons(ETH_P_ARP):
- case htons(ETH_P_RARP):
- nf_log_packet(net, NFPROTO_ARP, hooknum, skb, in, out,
- loginfo, "%s", prefix);
- break;
- }
+ nf_log_l2packet(net, pf, eth_hdr(skb)->h_proto, hooknum, skb,
+ in, out, loginfo, prefix);
}
static struct nf_logger nf_bridge_logger __read_mostly = {
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
index ad47a921b701..5974dbc1ea24 100644
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -23,7 +23,7 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr,
const struct nft_pktinfo *pkt)
{
const struct nft_meta *priv = nft_expr_priv(expr);
- const struct net_device *in = pkt->in, *out = pkt->out;
+ const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
u32 *dest = &regs->data[priv->dreg];
const struct net_bridge_port *p;
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index 4b3df6b0e3b9..206dc266ecd2 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -315,17 +315,20 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr,
case htons(ETH_P_IP):
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nft_reject_br_send_v4_unreach(pkt->net, pkt->skb,
- pkt->in, pkt->hook,
+ nft_reject_br_send_v4_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
priv->icmp_code);
break;
case NFT_REJECT_TCP_RST:
- nft_reject_br_send_v4_tcp_reset(pkt->net, pkt->skb,
- pkt->in, pkt->hook);
+ nft_reject_br_send_v4_tcp_reset(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt));
break;
case NFT_REJECT_ICMPX_UNREACH:
- nft_reject_br_send_v4_unreach(pkt->net, pkt->skb,
- pkt->in, pkt->hook,
+ nft_reject_br_send_v4_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
nft_reject_icmp_code(priv->icmp_code));
break;
}
@@ -333,17 +336,20 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr,
case htons(ETH_P_IPV6):
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nft_reject_br_send_v6_unreach(pkt->net, pkt->skb,
- pkt->in, pkt->hook,
+ nft_reject_br_send_v6_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
priv->icmp_code);
break;
case NFT_REJECT_TCP_RST:
- nft_reject_br_send_v6_tcp_reset(pkt->net, pkt->skb,
- pkt->in, pkt->hook);
+ nft_reject_br_send_v6_tcp_reset(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt));
break;
case NFT_REJECT_ICMPX_UNREACH:
- nft_reject_br_send_v6_unreach(pkt->net, pkt->skb,
- pkt->in, pkt->hook,
+ nft_reject_br_send_v6_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
nft_reject_icmpv6_code(priv->icmp_code));
break;
}
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index d730a0f68f46..2d38b6e34203 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -52,7 +52,7 @@ struct caif_net {
struct caif_device_entry_list caifdevs;
};
-static int caif_net_id;
+static unsigned int caif_net_id;
static int q_high = 50; /* Percent */
struct cfcnfg *get_cfcnfg(struct net *net)
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 92cbbd2afddb..adcad344c843 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -9,7 +9,7 @@
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/list.h>
diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c
index fa39fc298708..273cb07f57d8 100644
--- a/net/caif/cfcnfg.c
+++ b/net/caif/cfcnfg.c
@@ -390,8 +390,7 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
rcu_read_lock();
if (adapt_layer == NULL) {
- pr_debug("link setup response but no client exist,"
- "send linkdown back\n");
+ pr_debug("link setup response but no client exist, send linkdown back\n");
cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL);
goto unlock;
}
@@ -401,8 +400,7 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid);
if (phyinfo == NULL) {
- pr_err("ERROR: Link Layer Device disappeared"
- "while connecting\n");
+ pr_err("ERROR: Link Layer Device disappeared while connecting\n");
goto unlock;
}
@@ -436,8 +434,7 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
servicel = cfdbgl_create(channel_id, &phyinfo->dev_info);
break;
default:
- pr_err("Protocol error. Link setup response "
- "- unknown channel type\n");
+ pr_err("Protocol error. Link setup response - unknown channel type\n");
goto unlock;
}
if (!servicel)
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 3408ed51b611..1816fc9f1ee7 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -44,7 +44,6 @@ enum caif_states {
struct chnl_net {
struct cflayer chnl;
- struct net_device_stats stats;
struct caif_connect_request conn_req;
struct list_head list_field;
struct net_device *netdev;
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 1108079d934f..5488e4a6ccd0 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -445,6 +445,7 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask,
* @func: callback function on filter match
* @data: returned parameter for callback function
* @ident: string for calling module identification
+ * @sk: socket pointer (might be NULL)
*
* Description:
* Invokes the callback function with the received sk_buff and the given
@@ -468,7 +469,7 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask,
*/
int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask,
void (*func)(struct sk_buff *, void *), void *data,
- char *ident)
+ char *ident, struct sock *sk)
{
struct receiver *r;
struct hlist_head *rl;
@@ -496,6 +497,7 @@ int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask,
r->func = func;
r->data = data;
r->ident = ident;
+ r->sk = sk;
hlist_add_head_rcu(&r->list, rl);
d->entries++;
@@ -520,8 +522,11 @@ EXPORT_SYMBOL(can_rx_register);
static void can_rx_delete_receiver(struct rcu_head *rp)
{
struct receiver *r = container_of(rp, struct receiver, rcu);
+ struct sock *sk = r->sk;
kmem_cache_free(rcv_cache, r);
+ if (sk)
+ sock_put(sk);
}
/**
@@ -596,8 +601,11 @@ void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask,
spin_unlock(&can_rcvlists_lock);
/* schedule the receiver item for deletion */
- if (r)
+ if (r) {
+ if (r->sk)
+ sock_hold(r->sk);
call_rcu(&r->rcu, can_rx_delete_receiver);
+ }
}
EXPORT_SYMBOL(can_rx_unregister);
diff --git a/net/can/af_can.h b/net/can/af_can.h
index fca0fe9fc45a..b86f5129e838 100644
--- a/net/can/af_can.h
+++ b/net/can/af_can.h
@@ -50,13 +50,14 @@
struct receiver {
struct hlist_node list;
- struct rcu_head rcu;
canid_t can_id;
canid_t mask;
unsigned long matches;
void (*func)(struct sk_buff *, void *);
void *data;
char *ident;
+ struct sock *sk;
+ struct rcu_head rcu;
};
#define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS)
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 436a7537e6a9..95d13b233c65 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -199,11 +199,11 @@ static int bcm_proc_show(struct seq_file *m, void *v)
seq_printf(m, "%c ", (op->flags & RX_CHECK_DLC) ? 'd' : ' ');
- if (op->kt_ival1.tv64)
+ if (op->kt_ival1)
seq_printf(m, "timeo=%lld ",
(long long)ktime_to_us(op->kt_ival1));
- if (op->kt_ival2.tv64)
+ if (op->kt_ival2)
seq_printf(m, "thr=%lld ",
(long long)ktime_to_us(op->kt_ival2));
@@ -226,11 +226,11 @@ static int bcm_proc_show(struct seq_file *m, void *v)
else
seq_printf(m, "[%u] ", op->nframes);
- if (op->kt_ival1.tv64)
+ if (op->kt_ival1)
seq_printf(m, "t1=%lld ",
(long long)ktime_to_us(op->kt_ival1));
- if (op->kt_ival2.tv64)
+ if (op->kt_ival2)
seq_printf(m, "t2=%lld ",
(long long)ktime_to_us(op->kt_ival2));
@@ -365,11 +365,11 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
static void bcm_tx_start_timer(struct bcm_op *op)
{
- if (op->kt_ival1.tv64 && op->count)
+ if (op->kt_ival1 && op->count)
hrtimer_start(&op->timer,
ktime_add(ktime_get(), op->kt_ival1),
HRTIMER_MODE_ABS);
- else if (op->kt_ival2.tv64)
+ else if (op->kt_ival2)
hrtimer_start(&op->timer,
ktime_add(ktime_get(), op->kt_ival2),
HRTIMER_MODE_ABS);
@@ -380,7 +380,7 @@ static void bcm_tx_timeout_tsklet(unsigned long data)
struct bcm_op *op = (struct bcm_op *)data;
struct bcm_msg_head msg_head;
- if (op->kt_ival1.tv64 && (op->count > 0)) {
+ if (op->kt_ival1 && (op->count > 0)) {
op->count--;
if (!op->count && (op->flags & TX_COUNTEVT)) {
@@ -398,7 +398,7 @@ static void bcm_tx_timeout_tsklet(unsigned long data)
}
bcm_can_tx(op);
- } else if (op->kt_ival2.tv64)
+ } else if (op->kt_ival2)
bcm_can_tx(op);
bcm_tx_start_timer(op);
@@ -459,7 +459,7 @@ static void bcm_rx_update_and_send(struct bcm_op *op,
lastdata->flags |= (RX_RECV|RX_THR);
/* throttling mode inactive ? */
- if (!op->kt_ival2.tv64) {
+ if (!op->kt_ival2) {
/* send RX_CHANGED to the user immediately */
bcm_rx_changed(op, lastdata);
return;
@@ -470,7 +470,7 @@ static void bcm_rx_update_and_send(struct bcm_op *op,
return;
/* first reception with enabled throttling mode */
- if (!op->kt_lastmsg.tv64)
+ if (!op->kt_lastmsg)
goto rx_changed_settime;
/* got a second frame inside a potential throttle period? */
@@ -537,7 +537,7 @@ static void bcm_rx_starttimer(struct bcm_op *op)
if (op->flags & RX_NO_AUTOTIMER)
return;
- if (op->kt_ival1.tv64)
+ if (op->kt_ival1)
hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL);
}
@@ -643,7 +643,7 @@ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
} else {
/* rearm throttle handling */
- op->kt_lastmsg = ktime_set(0, 0);
+ op->kt_lastmsg = 0;
return HRTIMER_NORESTART;
}
}
@@ -734,14 +734,23 @@ static struct bcm_op *bcm_find_op(struct list_head *ops,
static void bcm_remove_op(struct bcm_op *op)
{
- hrtimer_cancel(&op->timer);
- hrtimer_cancel(&op->thrtimer);
-
- if (op->tsklet.func)
- tasklet_kill(&op->tsklet);
+ if (op->tsklet.func) {
+ while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) ||
+ test_bit(TASKLET_STATE_RUN, &op->tsklet.state) ||
+ hrtimer_active(&op->timer)) {
+ hrtimer_cancel(&op->timer);
+ tasklet_kill(&op->tsklet);
+ }
+ }
- if (op->thrtsklet.func)
- tasklet_kill(&op->thrtsklet);
+ if (op->thrtsklet.func) {
+ while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) ||
+ test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) ||
+ hrtimer_active(&op->thrtimer)) {
+ hrtimer_cancel(&op->thrtimer);
+ tasklet_kill(&op->thrtsklet);
+ }
+ }
if ((op->frames) && (op->frames != &op->sframe))
kfree(op->frames);
@@ -1005,7 +1014,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2);
/* disable an active timer due to zero values? */
- if (!op->kt_ival1.tv64 && !op->kt_ival2.tv64)
+ if (!op->kt_ival1 && !op->kt_ival2)
hrtimer_cancel(&op->timer);
}
@@ -1189,19 +1198,19 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2);
/* disable an active timer due to zero value? */
- if (!op->kt_ival1.tv64)
+ if (!op->kt_ival1)
hrtimer_cancel(&op->timer);
/*
* In any case cancel the throttle timer, flush
* potentially blocked msgs and reset throttle handling
*/
- op->kt_lastmsg = ktime_set(0, 0);
+ op->kt_lastmsg = 0;
hrtimer_cancel(&op->thrtimer);
bcm_rx_thr_flush(op, 1);
}
- if ((op->flags & STARTTIMER) && op->kt_ival1.tv64)
+ if ((op->flags & STARTTIMER) && op->kt_ival1)
hrtimer_start(&op->timer, op->kt_ival1,
HRTIMER_MODE_REL);
}
@@ -1216,7 +1225,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
err = can_rx_register(dev, op->can_id,
REGMASK(op->can_id),
bcm_rx_handler, op,
- "bcm");
+ "bcm", sk);
op->rx_reg_dev = dev;
dev_put(dev);
@@ -1225,7 +1234,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
} else
err = can_rx_register(NULL, op->can_id,
REGMASK(op->can_id),
- bcm_rx_handler, op, "bcm");
+ bcm_rx_handler, op, "bcm", sk);
if (err) {
/* this bcm rx op is broken -> remove it */
list_del(&op->list);
diff --git a/net/can/gw.c b/net/can/gw.c
index 455168718c2e..7056a1a2bb70 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -429,7 +429,7 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
/* clear the skb timestamp if not configured the other way */
if (!(gwj->flags & CGW_FLAGS_CAN_SRC_TSTAMP))
- nskb->tstamp.tv64 = 0;
+ nskb->tstamp = 0;
/* send to netdevice */
if (can_send(nskb, gwj->flags & CGW_FLAGS_CAN_ECHO))
@@ -442,7 +442,7 @@ static inline int cgw_register_filter(struct cgw_job *gwj)
{
return can_rx_register(gwj->src.dev, gwj->ccgw.filter.can_id,
gwj->ccgw.filter.can_mask, can_can_gw_rcv,
- gwj, "gw");
+ gwj, "gw", NULL);
}
static inline void cgw_unregister_filter(struct cgw_job *gwj)
diff --git a/net/can/raw.c b/net/can/raw.c
index b075f028d7e2..6dc546a06673 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -190,7 +190,7 @@ static int raw_enable_filters(struct net_device *dev, struct sock *sk,
for (i = 0; i < count; i++) {
err = can_rx_register(dev, filter[i].can_id,
filter[i].can_mask,
- raw_rcv, sk, "raw");
+ raw_rcv, sk, "raw", sk);
if (err) {
/* clean up successfully registered filters */
while (--i >= 0)
@@ -211,7 +211,7 @@ static int raw_enable_errfilter(struct net_device *dev, struct sock *sk,
if (err_mask)
err = can_rx_register(dev, 0, err_mask | CAN_ERR_FLAG,
- raw_rcv, sk, "raw");
+ raw_rcv, sk, "raw", sk);
return err;
}
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index c822b3ae1bd3..48bb8d95195b 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -315,13 +315,13 @@ int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
EXPORT_SYMBOL(ceph_auth_update_authorizer);
int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
- struct ceph_authorizer *a, size_t len)
+ struct ceph_authorizer *a)
{
int ret = 0;
mutex_lock(&ac->mutex);
if (ac->ops && ac->ops->verify_authorizer_reply)
- ret = ac->ops->verify_authorizer_reply(ac, a, len);
+ ret = ac->ops->verify_authorizer_reply(ac, a);
mutex_unlock(&ac->mutex);
return ret;
}
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index a0905f04bd13..2034fb926670 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -39,56 +39,58 @@ static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
return need != 0;
}
+static int ceph_x_encrypt_offset(void)
+{
+ return sizeof(u32) + sizeof(struct ceph_x_encrypt_header);
+}
+
static int ceph_x_encrypt_buflen(int ilen)
{
- return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
- sizeof(u32);
+ return ceph_x_encrypt_offset() + ilen + 16;
}
-static int ceph_x_encrypt(struct ceph_crypto_key *secret,
- void *ibuf, int ilen, void *obuf, size_t olen)
+static int ceph_x_encrypt(struct ceph_crypto_key *secret, void *buf,
+ int buf_len, int plaintext_len)
{
- struct ceph_x_encrypt_header head = {
- .struct_v = 1,
- .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
- };
- size_t len = olen - sizeof(u32);
+ struct ceph_x_encrypt_header *hdr = buf + sizeof(u32);
+ int ciphertext_len;
int ret;
- ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
- &head, sizeof(head), ibuf, ilen);
+ hdr->struct_v = 1;
+ hdr->magic = cpu_to_le64(CEPHX_ENC_MAGIC);
+
+ ret = ceph_crypt(secret, true, buf + sizeof(u32), buf_len - sizeof(u32),
+ plaintext_len + sizeof(struct ceph_x_encrypt_header),
+ &ciphertext_len);
if (ret)
return ret;
- ceph_encode_32(&obuf, len);
- return len + sizeof(u32);
+
+ ceph_encode_32(&buf, ciphertext_len);
+ return sizeof(u32) + ciphertext_len;
}
-static int ceph_x_decrypt(struct ceph_crypto_key *secret,
- void **p, void *end, void **obuf, size_t olen)
+static int ceph_x_decrypt(struct ceph_crypto_key *secret, void **p, void *end)
{
- struct ceph_x_encrypt_header head;
- size_t head_len = sizeof(head);
- int len, ret;
-
- len = ceph_decode_32(p);
- if (*p + len > end)
- return -EINVAL;
+ struct ceph_x_encrypt_header *hdr = *p + sizeof(u32);
+ int ciphertext_len, plaintext_len;
+ int ret;
- dout("ceph_x_decrypt len %d\n", len);
- if (*obuf == NULL) {
- *obuf = kmalloc(len, GFP_NOFS);
- if (!*obuf)
- return -ENOMEM;
- olen = len;
- }
+ ceph_decode_32_safe(p, end, ciphertext_len, e_inval);
+ ceph_decode_need(p, end, ciphertext_len, e_inval);
- ret = ceph_decrypt2(secret, &head, &head_len, *obuf, &olen, *p, len);
+ ret = ceph_crypt(secret, false, *p, end - *p, ciphertext_len,
+ &plaintext_len);
if (ret)
return ret;
- if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
+
+ if (hdr->struct_v != 1 || le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC)
return -EPERM;
- *p += len;
- return olen;
+
+ *p += ciphertext_len;
+ return plaintext_len - sizeof(struct ceph_x_encrypt_header);
+
+e_inval:
+ return -EINVAL;
}
/*
@@ -143,13 +145,10 @@ static int process_one_ticket(struct ceph_auth_client *ac,
int type;
u8 tkt_struct_v, blob_struct_v;
struct ceph_x_ticket_handler *th;
- void *dbuf = NULL;
void *dp, *dend;
int dlen;
char is_enc;
struct timespec validity;
- struct ceph_crypto_key old_key;
- void *ticket_buf = NULL;
void *tp, *tpend;
void **ptp;
struct ceph_crypto_key new_session_key;
@@ -174,20 +173,17 @@ static int process_one_ticket(struct ceph_auth_client *ac,
}
/* blob for me */
- dlen = ceph_x_decrypt(secret, p, end, &dbuf, 0);
- if (dlen <= 0) {
- ret = dlen;
+ dp = *p + ceph_x_encrypt_offset();
+ ret = ceph_x_decrypt(secret, p, end);
+ if (ret < 0)
goto out;
- }
- dout(" decrypted %d bytes\n", dlen);
- dp = dbuf;
- dend = dp + dlen;
+ dout(" decrypted %d bytes\n", ret);
+ dend = dp + ret;
tkt_struct_v = ceph_decode_8(&dp);
if (tkt_struct_v != 1)
goto bad;
- memcpy(&old_key, &th->session_key, sizeof(old_key));
ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
if (ret)
goto out;
@@ -203,15 +199,13 @@ static int process_one_ticket(struct ceph_auth_client *ac,
ceph_decode_8_safe(p, end, is_enc, bad);
if (is_enc) {
/* encrypted */
- dout(" encrypted ticket\n");
- dlen = ceph_x_decrypt(&old_key, p, end, &ticket_buf, 0);
- if (dlen < 0) {
- ret = dlen;
+ tp = *p + ceph_x_encrypt_offset();
+ ret = ceph_x_decrypt(&th->session_key, p, end);
+ if (ret < 0)
goto out;
- }
- tp = ticket_buf;
+ dout(" encrypted ticket, decrypted %d bytes\n", ret);
ptp = &tp;
- tpend = *ptp + dlen;
+ tpend = tp + ret;
} else {
/* unencrypted */
ptp = p;
@@ -242,8 +236,6 @@ static int process_one_ticket(struct ceph_auth_client *ac,
xi->have_keys |= th->service;
out:
- kfree(ticket_buf);
- kfree(dbuf);
return ret;
bad:
@@ -294,7 +286,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
{
int maxlen;
struct ceph_x_authorize_a *msg_a;
- struct ceph_x_authorize_b msg_b;
+ struct ceph_x_authorize_b *msg_b;
void *p, *end;
int ret;
int ticket_blob_len =
@@ -308,8 +300,8 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
if (ret)
goto out_au;
- maxlen = sizeof(*msg_a) + sizeof(msg_b) +
- ceph_x_encrypt_buflen(ticket_blob_len);
+ maxlen = sizeof(*msg_a) + ticket_blob_len +
+ ceph_x_encrypt_buflen(sizeof(*msg_b));
dout(" need len %d\n", maxlen);
if (au->buf && au->buf->alloc_len < maxlen) {
ceph_buffer_put(au->buf);
@@ -343,18 +335,19 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
p += ticket_blob_len;
end = au->buf->vec.iov_base + au->buf->vec.iov_len;
+ msg_b = p + ceph_x_encrypt_offset();
+ msg_b->struct_v = 1;
get_random_bytes(&au->nonce, sizeof(au->nonce));
- msg_b.struct_v = 1;
- msg_b.nonce = cpu_to_le64(au->nonce);
- ret = ceph_x_encrypt(&au->session_key, &msg_b, sizeof(msg_b),
- p, end - p);
+ msg_b->nonce = cpu_to_le64(au->nonce);
+ ret = ceph_x_encrypt(&au->session_key, p, end - p, sizeof(*msg_b));
if (ret < 0)
goto out_au;
+
p += ret;
+ WARN_ON(p > end);
au->buf->vec.iov_len = p - au->buf->vec.iov_base;
dout(" built authorizer nonce %llx len %d\n", au->nonce,
(int)au->buf->vec.iov_len);
- BUG_ON(au->buf->vec.iov_len > maxlen);
return 0;
out_au:
@@ -452,8 +445,9 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
if (need & CEPH_ENTITY_TYPE_AUTH) {
struct ceph_x_authenticate *auth = (void *)(head + 1);
void *p = auth + 1;
- struct ceph_x_challenge_blob tmp;
- char tmp_enc[40];
+ void *enc_buf = xi->auth_authorizer.enc_buf;
+ struct ceph_x_challenge_blob *blob = enc_buf +
+ ceph_x_encrypt_offset();
u64 *u;
if (p > end)
@@ -464,16 +458,16 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
/* encrypt and hash */
get_random_bytes(&auth->client_challenge, sizeof(u64));
- tmp.client_challenge = auth->client_challenge;
- tmp.server_challenge = cpu_to_le64(xi->server_challenge);
- ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
- tmp_enc, sizeof(tmp_enc));
+ blob->client_challenge = auth->client_challenge;
+ blob->server_challenge = cpu_to_le64(xi->server_challenge);
+ ret = ceph_x_encrypt(&xi->secret, enc_buf, CEPHX_AU_ENC_BUF_LEN,
+ sizeof(*blob));
if (ret < 0)
return ret;
auth->struct_v = 1;
auth->key = 0;
- for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
+ for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++)
auth->key ^= *(__le64 *)u;
dout(" server_challenge %llx client_challenge %llx key %llx\n",
xi->server_challenge, le64_to_cpu(auth->client_challenge),
@@ -600,8 +594,8 @@ static int ceph_x_create_authorizer(
auth->authorizer = (struct ceph_authorizer *) au;
auth->authorizer_buf = au->buf->vec.iov_base;
auth->authorizer_buf_len = au->buf->vec.iov_len;
- auth->authorizer_reply_buf = au->reply_buf;
- auth->authorizer_reply_buf_len = sizeof (au->reply_buf);
+ auth->authorizer_reply_buf = au->enc_buf;
+ auth->authorizer_reply_buf_len = CEPHX_AU_ENC_BUF_LEN;
auth->sign_message = ac->ops->sign_message;
auth->check_message_signature = ac->ops->check_message_signature;
@@ -629,27 +623,25 @@ static int ceph_x_update_authorizer(
}
static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
- struct ceph_authorizer *a, size_t len)
+ struct ceph_authorizer *a)
{
struct ceph_x_authorizer *au = (void *)a;
- int ret = 0;
- struct ceph_x_authorize_reply reply;
- void *preply = &reply;
- void *p = au->reply_buf;
- void *end = p + sizeof(au->reply_buf);
+ void *p = au->enc_buf;
+ struct ceph_x_authorize_reply *reply = p + ceph_x_encrypt_offset();
+ int ret;
- ret = ceph_x_decrypt(&au->session_key, &p, end, &preply, sizeof(reply));
+ ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN);
if (ret < 0)
return ret;
- if (ret != sizeof(reply))
+ if (ret != sizeof(*reply))
return -EPERM;
- if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
+ if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one))
ret = -EPERM;
else
ret = 0;
dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
- au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
+ au->nonce, le64_to_cpu(reply->nonce_plus_one), ret);
return ret;
}
@@ -704,35 +696,48 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
invalidate_ticket(ac, CEPH_ENTITY_TYPE_AUTH);
}
-static int calcu_signature(struct ceph_x_authorizer *au,
- struct ceph_msg *msg, __le64 *sig)
+static int calc_signature(struct ceph_x_authorizer *au, struct ceph_msg *msg,
+ __le64 *psig)
{
+ void *enc_buf = au->enc_buf;
+ struct {
+ __le32 len;
+ __le32 header_crc;
+ __le32 front_crc;
+ __le32 middle_crc;
+ __le32 data_crc;
+ } __packed *sigblock = enc_buf + ceph_x_encrypt_offset();
int ret;
- char tmp_enc[40];
- __le32 tmp[5] = {
- cpu_to_le32(16), msg->hdr.crc, msg->footer.front_crc,
- msg->footer.middle_crc, msg->footer.data_crc,
- };
- ret = ceph_x_encrypt(&au->session_key, &tmp, sizeof(tmp),
- tmp_enc, sizeof(tmp_enc));
+
+ sigblock->len = cpu_to_le32(4*sizeof(u32));
+ sigblock->header_crc = msg->hdr.crc;
+ sigblock->front_crc = msg->footer.front_crc;
+ sigblock->middle_crc = msg->footer.middle_crc;
+ sigblock->data_crc = msg->footer.data_crc;
+ ret = ceph_x_encrypt(&au->session_key, enc_buf, CEPHX_AU_ENC_BUF_LEN,
+ sizeof(*sigblock));
if (ret < 0)
return ret;
- *sig = *(__le64*)(tmp_enc + 4);
+
+ *psig = *(__le64 *)(enc_buf + sizeof(u32));
return 0;
}
static int ceph_x_sign_message(struct ceph_auth_handshake *auth,
struct ceph_msg *msg)
{
+ __le64 sig;
int ret;
if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN))
return 0;
- ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer,
- msg, &msg->footer.sig);
- if (ret < 0)
+ ret = calc_signature((struct ceph_x_authorizer *)auth->authorizer,
+ msg, &sig);
+ if (ret)
return ret;
+
+ msg->footer.sig = sig;
msg->footer.flags |= CEPH_MSG_FOOTER_SIGNED;
return 0;
}
@@ -746,9 +751,9 @@ static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth,
if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN))
return 0;
- ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer,
- msg, &sig_check);
- if (ret < 0)
+ ret = calc_signature((struct ceph_x_authorizer *)auth->authorizer,
+ msg, &sig_check);
+ if (ret)
return ret;
if (sig_check == msg->footer.sig)
return 0;
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
index 21a5af904bae..48e9ad41bd2a 100644
--- a/net/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -24,6 +24,7 @@ struct ceph_x_ticket_handler {
unsigned long renew_after, expires;
};
+#define CEPHX_AU_ENC_BUF_LEN 128 /* big enough for encrypted blob */
struct ceph_x_authorizer {
struct ceph_authorizer base;
@@ -32,7 +33,7 @@ struct ceph_x_authorizer {
unsigned int service;
u64 nonce;
u64 secret_id;
- char reply_buf[128]; /* big enough for encrypted blob */
+ char enc_buf[CEPHX_AU_ENC_BUF_LEN] __aligned(8);
};
struct ceph_x_info {
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 464e88599b9d..108533859a53 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -230,6 +230,7 @@ enum {
Opt_osdkeepalivetimeout,
Opt_mount_timeout,
Opt_osd_idle_ttl,
+ Opt_osd_request_timeout,
Opt_last_int,
/* int args above */
Opt_fsid,
@@ -256,6 +257,7 @@ static match_table_t opt_tokens = {
{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
{Opt_mount_timeout, "mount_timeout=%d"},
{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+ {Opt_osd_request_timeout, "osd_request_timeout=%d"},
/* int args above */
{Opt_fsid, "fsid=%s"},
{Opt_name, "name=%s"},
@@ -361,6 +363,7 @@ ceph_parse_options(char *options, const char *dev_name,
opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
+ opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
/* get mon ip(s) */
/* ip1[:port1][,ip2[:port2]...] */
@@ -473,6 +476,15 @@ ceph_parse_options(char *options, const char *dev_name,
}
opt->mount_timeout = msecs_to_jiffies(intval * 1000);
break;
+ case Opt_osd_request_timeout:
+ /* 0 is "wait forever" (i.e. infinite timeout) */
+ if (intval < 0 || intval > INT_MAX / 1000) {
+ pr_err("osd_request_timeout out of range\n");
+ err = -EINVAL;
+ goto out;
+ }
+ opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
+ break;
case Opt_share:
opt->flags &= ~CEPH_OPT_NOSHARE;
@@ -557,6 +569,9 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
seq_printf(m, "osdkeepalivetimeout=%d,",
jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
+ if (opt->osd_request_timeout != CEPH_OSD_REQUEST_TIMEOUT_DEFAULT)
+ seq_printf(m, "osd_request_timeout=%d,",
+ jiffies_to_msecs(opt->osd_request_timeout) / 1000);
/* drop redundant comma */
if (m->count != pos)
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
index 50f040fdb2a9..b9233b990399 100644
--- a/net/ceph/cls_lock_client.c
+++ b/net/ceph/cls_lock_client.c
@@ -69,8 +69,8 @@ int ceph_cls_lock(struct ceph_osd_client *osdc,
dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n",
__func__, lock_name, type, cookie, tag, desc, flags);
ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock",
- CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
- lock_op_page, lock_op_buf_size, NULL, NULL);
+ CEPH_OSD_FLAG_WRITE, lock_op_page,
+ lock_op_buf_size, NULL, NULL);
dout("%s: status %d\n", __func__, ret);
__free_page(lock_op_page);
@@ -117,8 +117,8 @@ int ceph_cls_unlock(struct ceph_osd_client *osdc,
dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie);
ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock",
- CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
- unlock_op_page, unlock_op_buf_size, NULL, NULL);
+ CEPH_OSD_FLAG_WRITE, unlock_op_page,
+ unlock_op_buf_size, NULL, NULL);
dout("%s: status %d\n", __func__, ret);
__free_page(unlock_op_page);
@@ -170,8 +170,8 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc,
dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name,
cookie, ENTITY_NAME(*locker));
ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock",
- CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
- break_op_page, break_op_buf_size, NULL, NULL);
+ CEPH_OSD_FLAG_WRITE, break_op_page,
+ break_op_buf_size, NULL, NULL);
dout("%s: status %d\n", __func__, ret);
__free_page(break_op_page);
@@ -278,7 +278,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
int get_info_op_buf_size;
int name_len = strlen(lock_name);
struct page *get_info_op_page, *reply_page;
- size_t reply_len;
+ size_t reply_len = PAGE_SIZE;
void *p, *end;
int ret;
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 80d7c3a97cb8..5bf94c04f645 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
{
- kfree(b->h.perm);
kfree(b->h.items);
kfree(b);
}
@@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b)
{
kfree(b->item_weights);
kfree(b->sum_weights);
- kfree(b->h.perm);
kfree(b->h.items);
kfree(b);
}
void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
{
- kfree(b->h.perm);
kfree(b->h.items);
kfree(b->node_weights);
kfree(b);
@@ -71,7 +68,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
{
kfree(b->straws);
kfree(b->item_weights);
- kfree(b->h.perm);
kfree(b->h.items);
kfree(b);
}
@@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
{
kfree(b->item_weights);
- kfree(b->h.perm);
kfree(b->h.items);
kfree(b);
}
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index a421e905331a..b5cd8c21bfdf 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -17,10 +17,12 @@
# include <linux/kernel.h>
# include <linux/crush/crush.h>
# include <linux/crush/hash.h>
+# include <linux/crush/mapper.h>
#else
# include "crush_compat.h"
# include "crush.h"
# include "hash.h"
+# include "mapper.h"
#endif
#include "crush_ln_table.h"
@@ -52,7 +54,6 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
return -1;
}
-
/*
* bucket choose methods
*
@@ -70,59 +71,60 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
* Since this is expensive, we optimize for the r=0 case, which
* captures the vast majority of calls.
*/
-static int bucket_perm_choose(struct crush_bucket *bucket,
+static int bucket_perm_choose(const struct crush_bucket *bucket,
+ struct crush_work_bucket *work,
int x, int r)
{
unsigned int pr = r % bucket->size;
unsigned int i, s;
/* start a new permutation if @x has changed */
- if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) {
+ if (work->perm_x != (__u32)x || work->perm_n == 0) {
dprintk("bucket %d new x=%d\n", bucket->id, x);
- bucket->perm_x = x;
+ work->perm_x = x;
/* optimize common r=0 case */
if (pr == 0) {
s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
bucket->size;
- bucket->perm[0] = s;
- bucket->perm_n = 0xffff; /* magic value, see below */
+ work->perm[0] = s;
+ work->perm_n = 0xffff; /* magic value, see below */
goto out;
}
for (i = 0; i < bucket->size; i++)
- bucket->perm[i] = i;
- bucket->perm_n = 0;
- } else if (bucket->perm_n == 0xffff) {
+ work->perm[i] = i;
+ work->perm_n = 0;
+ } else if (work->perm_n == 0xffff) {
/* clean up after the r=0 case above */
for (i = 1; i < bucket->size; i++)
- bucket->perm[i] = i;
- bucket->perm[bucket->perm[0]] = 0;
- bucket->perm_n = 1;
+ work->perm[i] = i;
+ work->perm[work->perm[0]] = 0;
+ work->perm_n = 1;
}
/* calculate permutation up to pr */
- for (i = 0; i < bucket->perm_n; i++)
- dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
- while (bucket->perm_n <= pr) {
- unsigned int p = bucket->perm_n;
+ for (i = 0; i < work->perm_n; i++)
+ dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
+ while (work->perm_n <= pr) {
+ unsigned int p = work->perm_n;
/* no point in swapping the final entry */
if (p < bucket->size - 1) {
i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
(bucket->size - p);
if (i) {
- unsigned int t = bucket->perm[p + i];
- bucket->perm[p + i] = bucket->perm[p];
- bucket->perm[p] = t;
+ unsigned int t = work->perm[p + i];
+ work->perm[p + i] = work->perm[p];
+ work->perm[p] = t;
}
dprintk(" perm_choose swap %d with %d\n", p, p+i);
}
- bucket->perm_n++;
+ work->perm_n++;
}
for (i = 0; i < bucket->size; i++)
- dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
+ dprintk(" perm_choose %d: %d\n", i, work->perm[i]);
- s = bucket->perm[pr];
+ s = work->perm[pr];
out:
dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
bucket->size, x, r, pr, s);
@@ -130,14 +132,14 @@ out:
}
/* uniform */
-static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
- int x, int r)
+static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
+ struct crush_work_bucket *work, int x, int r)
{
- return bucket_perm_choose(&bucket->h, x, r);
+ return bucket_perm_choose(&bucket->h, work, x, r);
}
/* list */
-static int bucket_list_choose(struct crush_bucket_list *bucket,
+static int bucket_list_choose(const struct crush_bucket_list *bucket,
int x, int r)
{
int i;
@@ -153,8 +155,9 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
w *= bucket->sum_weights[i];
w = w >> 16;
/*dprintk(" scaled %llx\n", w);*/
- if (w < bucket->item_weights[i])
+ if (w < bucket->item_weights[i]) {
return bucket->h.items[i];
+ }
}
dprintk("bad list sums for bucket %d\n", bucket->h.id);
@@ -190,7 +193,7 @@ static int terminal(int x)
return x & 1;
}
-static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
int x, int r)
{
int n;
@@ -222,7 +225,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
/* straw */
-static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
int x, int r)
{
__u32 i;
@@ -299,7 +302,7 @@ static __u64 crush_ln(unsigned int xin)
*
*/
-static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
+static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
int x, int r)
{
unsigned int i, high = 0;
@@ -342,37 +345,42 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
high_draw = draw;
}
}
+
return bucket->h.items[high];
}
-static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+static int crush_bucket_choose(const struct crush_bucket *in,
+ struct crush_work_bucket *work,
+ int x, int r)
{
dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
BUG_ON(in->size == 0);
switch (in->alg) {
case CRUSH_BUCKET_UNIFORM:
- return bucket_uniform_choose((struct crush_bucket_uniform *)in,
- x, r);
+ return bucket_uniform_choose(
+ (const struct crush_bucket_uniform *)in,
+ work, x, r);
case CRUSH_BUCKET_LIST:
- return bucket_list_choose((struct crush_bucket_list *)in,
+ return bucket_list_choose((const struct crush_bucket_list *)in,
x, r);
case CRUSH_BUCKET_TREE:
- return bucket_tree_choose((struct crush_bucket_tree *)in,
+ return bucket_tree_choose((const struct crush_bucket_tree *)in,
x, r);
case CRUSH_BUCKET_STRAW:
- return bucket_straw_choose((struct crush_bucket_straw *)in,
- x, r);
+ return bucket_straw_choose(
+ (const struct crush_bucket_straw *)in,
+ x, r);
case CRUSH_BUCKET_STRAW2:
- return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
- x, r);
+ return bucket_straw2_choose(
+ (const struct crush_bucket_straw2 *)in,
+ x, r);
default:
dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
return in->items[0];
}
}
-
/*
* true if device is marked "out" (failed, fully offloaded)
* of the cluster
@@ -414,7 +422,8 @@ static int is_out(const struct crush_map *map,
* @parent_r: r value passed from the parent
*/
static int crush_choose_firstn(const struct crush_map *map,
- struct crush_bucket *bucket,
+ struct crush_work *work,
+ const struct crush_bucket *bucket,
const __u32 *weight, int weight_max,
int x, int numrep, int type,
int *out, int outpos,
@@ -432,7 +441,7 @@ static int crush_choose_firstn(const struct crush_map *map,
int rep;
unsigned int ftotal, flocal;
int retry_descent, retry_bucket, skip_rep;
- struct crush_bucket *in = bucket;
+ const struct crush_bucket *in = bucket;
int r;
int i;
int item = 0;
@@ -471,9 +480,13 @@ static int crush_choose_firstn(const struct crush_map *map,
if (local_fallback_retries > 0 &&
flocal >= (in->size>>1) &&
flocal > local_fallback_retries)
- item = bucket_perm_choose(in, x, r);
+ item = bucket_perm_choose(
+ in, work->work[-1-in->id],
+ x, r);
else
- item = crush_bucket_choose(in, x, r);
+ item = crush_bucket_choose(
+ in, work->work[-1-in->id],
+ x, r);
if (item >= map->max_devices) {
dprintk(" bad item %d\n", item);
skip_rep = 1;
@@ -516,19 +529,21 @@ static int crush_choose_firstn(const struct crush_map *map,
sub_r = r >> (vary_r-1);
else
sub_r = 0;
- if (crush_choose_firstn(map,
- map->buckets[-1-item],
- weight, weight_max,
- x, stable ? 1 : outpos+1, 0,
- out2, outpos, count,
- recurse_tries, 0,
- local_retries,
- local_fallback_retries,
- 0,
- vary_r,
- stable,
- NULL,
- sub_r) <= outpos)
+ if (crush_choose_firstn(
+ map,
+ work,
+ map->buckets[-1-item],
+ weight, weight_max,
+ x, stable ? 1 : outpos+1, 0,
+ out2, outpos, count,
+ recurse_tries, 0,
+ local_retries,
+ local_fallback_retries,
+ 0,
+ vary_r,
+ stable,
+ NULL,
+ sub_r) <= outpos)
/* didn't get leaf */
reject = 1;
} else {
@@ -537,14 +552,12 @@ static int crush_choose_firstn(const struct crush_map *map,
}
}
- if (!reject) {
+ if (!reject && !collide) {
/* out? */
if (itemtype == 0)
reject = is_out(map, weight,
weight_max,
item, x);
- else
- reject = 0;
}
reject:
@@ -598,7 +611,8 @@ reject:
*
*/
static void crush_choose_indep(const struct crush_map *map,
- struct crush_bucket *bucket,
+ struct crush_work *work,
+ const struct crush_bucket *bucket,
const __u32 *weight, int weight_max,
int x, int left, int numrep, int type,
int *out, int outpos,
@@ -608,7 +622,7 @@ static void crush_choose_indep(const struct crush_map *map,
int *out2,
int parent_r)
{
- struct crush_bucket *in = bucket;
+ const struct crush_bucket *in = bucket;
int endpos = outpos + left;
int rep;
unsigned int ftotal;
@@ -676,7 +690,9 @@ static void crush_choose_indep(const struct crush_map *map,
break;
}
- item = crush_bucket_choose(in, x, r);
+ item = crush_bucket_choose(
+ in, work->work[-1-in->id],
+ x, r);
if (item >= map->max_devices) {
dprintk(" bad item %d\n", item);
out[rep] = CRUSH_ITEM_NONE;
@@ -722,13 +738,15 @@ static void crush_choose_indep(const struct crush_map *map,
if (recurse_to_leaf) {
if (item < 0) {
- crush_choose_indep(map,
- map->buckets[-1-item],
- weight, weight_max,
- x, 1, numrep, 0,
- out2, rep,
- recurse_tries, 0,
- 0, NULL, r);
+ crush_choose_indep(
+ map,
+ work,
+ map->buckets[-1-item],
+ weight, weight_max,
+ x, 1, numrep, 0,
+ out2, rep,
+ recurse_tries, 0,
+ 0, NULL, r);
if (out2[rep] == CRUSH_ITEM_NONE) {
/* placed nothing; no leaf */
break;
@@ -779,6 +797,53 @@ static void crush_choose_indep(const struct crush_map *map,
#endif
}
+
+/*
+ * This takes a chunk of memory and sets it up to be a shiny new
+ * working area for a CRUSH placement computation. It must be called
+ * on any newly allocated memory before passing it in to
+ * crush_do_rule. It may be used repeatedly after that, so long as the
+ * map has not changed. If the map /has/ changed, you must make sure
+ * the working size is no smaller than what was allocated and re-run
+ * crush_init_workspace.
+ *
+ * If you do retain the working space between calls to crush, make it
+ * thread-local.
+ */
+void crush_init_workspace(const struct crush_map *map, void *v)
+{
+ struct crush_work *w = v;
+ __s32 b;
+
+ /*
+ * We work by moving through the available space and setting
+ * values and pointers as we go.
+ *
+ * It's a bit like Forth's use of the 'allot' word since we
+ * set the pointer first and then reserve the space for it to
+ * point to by incrementing the point.
+ */
+ v += sizeof(struct crush_work *);
+ w->work = v;
+ v += map->max_buckets * sizeof(struct crush_work_bucket *);
+ for (b = 0; b < map->max_buckets; ++b) {
+ if (!map->buckets[b])
+ continue;
+
+ w->work[b] = v;
+ switch (map->buckets[b]->alg) {
+ default:
+ v += sizeof(struct crush_work_bucket);
+ break;
+ }
+ w->work[b]->perm_x = 0;
+ w->work[b]->perm_n = 0;
+ w->work[b]->perm = v;
+ v += map->buckets[b]->size * sizeof(__u32);
+ }
+ BUG_ON(v - (void *)w != map->working_size);
+}
+
/**
* crush_do_rule - calculate a mapping with the given input and rule
* @map: the crush_map
@@ -788,24 +853,25 @@ static void crush_choose_indep(const struct crush_map *map,
* @result_max: maximum result size
* @weight: weight vector (for map leaves)
* @weight_max: size of weight vector
- * @scratch: scratch vector for private use; must be >= 3 * result_max
+ * @cwin: pointer to at least crush_work_size() bytes of memory
*/
int crush_do_rule(const struct crush_map *map,
int ruleno, int x, int *result, int result_max,
const __u32 *weight, int weight_max,
- int *scratch)
+ void *cwin)
{
int result_len;
- int *a = scratch;
- int *b = scratch + result_max;
- int *c = scratch + result_max*2;
+ struct crush_work *cw = cwin;
+ int *a = cwin + map->working_size;
+ int *b = a + result_max;
+ int *c = b + result_max;
+ int *w = a;
+ int *o = b;
int recurse_to_leaf;
- int *w;
int wsize = 0;
- int *o;
int osize;
int *tmp;
- struct crush_rule *rule;
+ const struct crush_rule *rule;
__u32 step;
int i, j;
int numrep;
@@ -833,12 +899,10 @@ int crush_do_rule(const struct crush_map *map,
rule = map->rules[ruleno];
result_len = 0;
- w = a;
- o = b;
for (step = 0; step < rule->len; step++) {
int firstn = 0;
- struct crush_rule_step *curstep = &rule->steps[step];
+ const struct crush_rule_step *curstep = &rule->steps[step];
switch (curstep->op) {
case CRUSH_RULE_TAKE:
@@ -934,6 +998,7 @@ int crush_do_rule(const struct crush_map *map,
recurse_tries = choose_tries;
osize += crush_choose_firstn(
map,
+ cw,
map->buckets[bno],
weight, weight_max,
x, numrep,
@@ -954,6 +1019,7 @@ int crush_do_rule(const struct crush_map *map,
numrep : (result_max-osize));
crush_choose_indep(
map,
+ cw,
map->buckets[bno],
weight, weight_max,
x, out_size, numrep,
@@ -995,5 +1061,6 @@ int crush_do_rule(const struct crush_map *map,
break;
}
}
+
return result_len;
}
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index db2847ac5f12..46008d5ac504 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -3,24 +3,72 @@
#include <linux/err.h>
#include <linux/scatterlist.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <crypto/aes.h>
#include <crypto/skcipher.h>
#include <linux/key-type.h>
+#include <linux/sched/mm.h>
#include <keys/ceph-type.h>
#include <keys/user-type.h>
#include <linux/ceph/decode.h>
#include "crypto.h"
+/*
+ * Set ->key and ->tfm. The rest of the key should be filled in before
+ * this function is called.
+ */
+static int set_secret(struct ceph_crypto_key *key, void *buf)
+{
+ unsigned int noio_flag;
+ int ret;
+
+ key->key = NULL;
+ key->tfm = NULL;
+
+ switch (key->type) {
+ case CEPH_CRYPTO_NONE:
+ return 0; /* nothing to do */
+ case CEPH_CRYPTO_AES:
+ break;
+ default:
+ return -ENOTSUPP;
+ }
+
+ WARN_ON(!key->len);
+ key->key = kmemdup(buf, key->len, GFP_NOIO);
+ if (!key->key) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ /* crypto_alloc_skcipher() allocates with GFP_KERNEL */
+ noio_flag = memalloc_noio_save();
+ key->tfm = crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+ memalloc_noio_restore(noio_flag);
+ if (IS_ERR(key->tfm)) {
+ ret = PTR_ERR(key->tfm);
+ key->tfm = NULL;
+ goto fail;
+ }
+
+ ret = crypto_skcipher_setkey(key->tfm, key->key, key->len);
+ if (ret)
+ goto fail;
+
+ return 0;
+
+fail:
+ ceph_crypto_key_destroy(key);
+ return ret;
+}
+
int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
const struct ceph_crypto_key *src)
{
memcpy(dst, src, sizeof(struct ceph_crypto_key));
- dst->key = kmemdup(src->key, src->len, GFP_NOFS);
- if (!dst->key)
- return -ENOMEM;
- return 0;
+ return set_secret(dst, src->key);
}
int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
@@ -37,16 +85,16 @@ int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
{
+ int ret;
+
ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
key->type = ceph_decode_16(p);
ceph_decode_copy(p, &key->created, sizeof(key->created));
key->len = ceph_decode_16(p);
ceph_decode_need(p, end, key->len, bad);
- key->key = kmalloc(key->len, GFP_NOFS);
- if (!key->key)
- return -ENOMEM;
- ceph_decode_copy(p, key->key, key->len);
- return 0;
+ ret = set_secret(key, *p);
+ *p += key->len;
+ return ret;
bad:
dout("failed to decode crypto key\n");
@@ -80,9 +128,14 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
return 0;
}
-static struct crypto_skcipher *ceph_crypto_alloc_cipher(void)
+void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
{
- return crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (key) {
+ kfree(key->key);
+ key->key = NULL;
+ crypto_free_skcipher(key->tfm);
+ key->tfm = NULL;
+ }
}
static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
@@ -157,372 +210,82 @@ static void teardown_sgtable(struct sg_table *sgt)
sg_free_table(sgt);
}
-static int ceph_aes_encrypt(const void *key, int key_len,
- void *dst, size_t *dst_len,
- const void *src, size_t src_len)
-{
- struct scatterlist sg_in[2], prealloc_sg;
- struct sg_table sg_out;
- struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
- int ret;
- char iv[AES_BLOCK_SIZE];
- size_t zero_padding = (0x10 - (src_len & 0x0f));
- char pad[16];
-
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- memset(pad, zero_padding, zero_padding);
-
- *dst_len = src_len + zero_padding;
-
- sg_init_table(sg_in, 2);
- sg_set_buf(&sg_in[0], src, src_len);
- sg_set_buf(&sg_in[1], pad, zero_padding);
- ret = setup_sgtable(&sg_out, &prealloc_sg, dst, *dst_len);
- if (ret)
- goto out_tfm;
-
- crypto_skcipher_setkey((void *)tfm, key, key_len);
- memcpy(iv, aes_iv, AES_BLOCK_SIZE);
-
- skcipher_request_set_tfm(req, tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg_in, sg_out.sgl,
- src_len + zero_padding, iv);
-
- /*
- print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
- key, key_len, 1);
- print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
- src, src_len, 1);
- print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
- pad, zero_padding, 1);
- */
- ret = crypto_skcipher_encrypt(req);
- skcipher_request_zero(req);
- if (ret < 0) {
- pr_err("ceph_aes_crypt failed %d\n", ret);
- goto out_sg;
- }
- /*
- print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
- dst, *dst_len, 1);
- */
-
-out_sg:
- teardown_sgtable(&sg_out);
-out_tfm:
- crypto_free_skcipher(tfm);
- return ret;
-}
-
-static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
- size_t *dst_len,
- const void *src1, size_t src1_len,
- const void *src2, size_t src2_len)
-{
- struct scatterlist sg_in[3], prealloc_sg;
- struct sg_table sg_out;
- struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
- int ret;
- char iv[AES_BLOCK_SIZE];
- size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
- char pad[16];
-
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- memset(pad, zero_padding, zero_padding);
-
- *dst_len = src1_len + src2_len + zero_padding;
-
- sg_init_table(sg_in, 3);
- sg_set_buf(&sg_in[0], src1, src1_len);
- sg_set_buf(&sg_in[1], src2, src2_len);
- sg_set_buf(&sg_in[2], pad, zero_padding);
- ret = setup_sgtable(&sg_out, &prealloc_sg, dst, *dst_len);
- if (ret)
- goto out_tfm;
-
- crypto_skcipher_setkey((void *)tfm, key, key_len);
- memcpy(iv, aes_iv, AES_BLOCK_SIZE);
-
- skcipher_request_set_tfm(req, tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg_in, sg_out.sgl,
- src1_len + src2_len + zero_padding, iv);
-
- /*
- print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
- key, key_len, 1);
- print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
- src1, src1_len, 1);
- print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
- src2, src2_len, 1);
- print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
- pad, zero_padding, 1);
- */
- ret = crypto_skcipher_encrypt(req);
- skcipher_request_zero(req);
- if (ret < 0) {
- pr_err("ceph_aes_crypt2 failed %d\n", ret);
- goto out_sg;
- }
- /*
- print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
- dst, *dst_len, 1);
- */
-
-out_sg:
- teardown_sgtable(&sg_out);
-out_tfm:
- crypto_free_skcipher(tfm);
- return ret;
-}
-
-static int ceph_aes_decrypt(const void *key, int key_len,
- void *dst, size_t *dst_len,
- const void *src, size_t src_len)
+static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt,
+ void *buf, int buf_len, int in_len, int *pout_len)
{
- struct sg_table sg_in;
- struct scatterlist sg_out[2], prealloc_sg;
- struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
- char pad[16];
- char iv[AES_BLOCK_SIZE];
+ SKCIPHER_REQUEST_ON_STACK(req, key->tfm);
+ struct sg_table sgt;
+ struct scatterlist prealloc_sg;
+ char iv[AES_BLOCK_SIZE] __aligned(8);
+ int pad_byte = AES_BLOCK_SIZE - (in_len & (AES_BLOCK_SIZE - 1));
+ int crypt_len = encrypt ? in_len + pad_byte : in_len;
int ret;
- int last_byte;
-
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
- sg_init_table(sg_out, 2);
- sg_set_buf(&sg_out[0], dst, *dst_len);
- sg_set_buf(&sg_out[1], pad, sizeof(pad));
- ret = setup_sgtable(&sg_in, &prealloc_sg, src, src_len);
+ WARN_ON(crypt_len > buf_len);
+ if (encrypt)
+ memset(buf + in_len, pad_byte, pad_byte);
+ ret = setup_sgtable(&sgt, &prealloc_sg, buf, crypt_len);
if (ret)
- goto out_tfm;
+ return ret;
- crypto_skcipher_setkey((void *)tfm, key, key_len);
memcpy(iv, aes_iv, AES_BLOCK_SIZE);
-
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_tfm(req, key->tfm);
skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg_in.sgl, sg_out,
- src_len, iv);
+ skcipher_request_set_crypt(req, sgt.sgl, sgt.sgl, crypt_len, iv);
/*
- print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
- key, key_len, 1);
- print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
- src, src_len, 1);
+ print_hex_dump(KERN_ERR, "key: ", DUMP_PREFIX_NONE, 16, 1,
+ key->key, key->len, 1);
+ print_hex_dump(KERN_ERR, " in: ", DUMP_PREFIX_NONE, 16, 1,
+ buf, crypt_len, 1);
*/
- ret = crypto_skcipher_decrypt(req);
- skcipher_request_zero(req);
- if (ret < 0) {
- pr_err("ceph_aes_decrypt failed %d\n", ret);
- goto out_sg;
- }
-
- if (src_len <= *dst_len)
- last_byte = ((char *)dst)[src_len - 1];
+ if (encrypt)
+ ret = crypto_skcipher_encrypt(req);
else
- last_byte = pad[src_len - *dst_len - 1];
- if (last_byte <= 16 && src_len >= last_byte) {
- *dst_len = src_len - last_byte;
- } else {
- pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
- last_byte, (int)src_len);
- return -EPERM; /* bad padding */
- }
- /*
- print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
- dst, *dst_len, 1);
- */
-
-out_sg:
- teardown_sgtable(&sg_in);
-out_tfm:
- crypto_free_skcipher(tfm);
- return ret;
-}
-
-static int ceph_aes_decrypt2(const void *key, int key_len,
- void *dst1, size_t *dst1_len,
- void *dst2, size_t *dst2_len,
- const void *src, size_t src_len)
-{
- struct sg_table sg_in;
- struct scatterlist sg_out[3], prealloc_sg;
- struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
- char pad[16];
- char iv[AES_BLOCK_SIZE];
- int ret;
- int last_byte;
-
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- sg_init_table(sg_out, 3);
- sg_set_buf(&sg_out[0], dst1, *dst1_len);
- sg_set_buf(&sg_out[1], dst2, *dst2_len);
- sg_set_buf(&sg_out[2], pad, sizeof(pad));
- ret = setup_sgtable(&sg_in, &prealloc_sg, src, src_len);
- if (ret)
- goto out_tfm;
-
- crypto_skcipher_setkey((void *)tfm, key, key_len);
- memcpy(iv, aes_iv, AES_BLOCK_SIZE);
-
- skcipher_request_set_tfm(req, tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg_in.sgl, sg_out,
- src_len, iv);
-
- /*
- print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
- key, key_len, 1);
- print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
- src, src_len, 1);
- */
- ret = crypto_skcipher_decrypt(req);
+ ret = crypto_skcipher_decrypt(req);
skcipher_request_zero(req);
- if (ret < 0) {
- pr_err("ceph_aes_decrypt failed %d\n", ret);
- goto out_sg;
- }
-
- if (src_len <= *dst1_len)
- last_byte = ((char *)dst1)[src_len - 1];
- else if (src_len <= *dst1_len + *dst2_len)
- last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
- else
- last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
- if (last_byte <= 16 && src_len >= last_byte) {
- src_len -= last_byte;
- } else {
- pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
- last_byte, (int)src_len);
- return -EPERM; /* bad padding */
- }
-
- if (src_len < *dst1_len) {
- *dst1_len = src_len;
- *dst2_len = 0;
- } else {
- *dst2_len = src_len - *dst1_len;
+ if (ret) {
+ pr_err("%s %scrypt failed: %d\n", __func__,
+ encrypt ? "en" : "de", ret);
+ goto out_sgt;
}
/*
- print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
- dst1, *dst1_len, 1);
- print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
- dst2, *dst2_len, 1);
+ print_hex_dump(KERN_ERR, "out: ", DUMP_PREFIX_NONE, 16, 1,
+ buf, crypt_len, 1);
*/
-out_sg:
- teardown_sgtable(&sg_in);
-out_tfm:
- crypto_free_skcipher(tfm);
- return ret;
-}
-
-
-int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
- const void *src, size_t src_len)
-{
- switch (secret->type) {
- case CEPH_CRYPTO_NONE:
- if (*dst_len < src_len)
- return -ERANGE;
- memcpy(dst, src, src_len);
- *dst_len = src_len;
- return 0;
-
- case CEPH_CRYPTO_AES:
- return ceph_aes_decrypt(secret->key, secret->len, dst,
- dst_len, src, src_len);
-
- default:
- return -EINVAL;
- }
-}
-
-int ceph_decrypt2(struct ceph_crypto_key *secret,
- void *dst1, size_t *dst1_len,
- void *dst2, size_t *dst2_len,
- const void *src, size_t src_len)
-{
- size_t t;
-
- switch (secret->type) {
- case CEPH_CRYPTO_NONE:
- if (*dst1_len + *dst2_len < src_len)
- return -ERANGE;
- t = min(*dst1_len, src_len);
- memcpy(dst1, src, t);
- *dst1_len = t;
- src += t;
- src_len -= t;
- if (src_len) {
- t = min(*dst2_len, src_len);
- memcpy(dst2, src, t);
- *dst2_len = t;
+ if (encrypt) {
+ *pout_len = crypt_len;
+ } else {
+ pad_byte = *(char *)(buf + in_len - 1);
+ if (pad_byte > 0 && pad_byte <= AES_BLOCK_SIZE &&
+ in_len >= pad_byte) {
+ *pout_len = in_len - pad_byte;
+ } else {
+ pr_err("%s got bad padding %d on in_len %d\n",
+ __func__, pad_byte, in_len);
+ ret = -EPERM;
+ goto out_sgt;
}
- return 0;
-
- case CEPH_CRYPTO_AES:
- return ceph_aes_decrypt2(secret->key, secret->len,
- dst1, dst1_len, dst2, dst2_len,
- src, src_len);
-
- default:
- return -EINVAL;
}
-}
-
-int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
- const void *src, size_t src_len)
-{
- switch (secret->type) {
- case CEPH_CRYPTO_NONE:
- if (*dst_len < src_len)
- return -ERANGE;
- memcpy(dst, src, src_len);
- *dst_len = src_len;
- return 0;
- case CEPH_CRYPTO_AES:
- return ceph_aes_encrypt(secret->key, secret->len, dst,
- dst_len, src, src_len);
-
- default:
- return -EINVAL;
- }
+out_sgt:
+ teardown_sgtable(&sgt);
+ return ret;
}
-int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
- const void *src1, size_t src1_len,
- const void *src2, size_t src2_len)
+int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt,
+ void *buf, int buf_len, int in_len, int *pout_len)
{
- switch (secret->type) {
+ switch (key->type) {
case CEPH_CRYPTO_NONE:
- if (*dst_len < src1_len + src2_len)
- return -ERANGE;
- memcpy(dst, src1, src1_len);
- memcpy(dst + src1_len, src2, src2_len);
- *dst_len = src1_len + src2_len;
+ *pout_len = in_len;
return 0;
-
case CEPH_CRYPTO_AES:
- return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
- src1, src1_len, src2, src2_len);
-
+ return ceph_aes_crypt(key, encrypt, buf, buf_len, in_len,
+ pout_len);
default:
- return -EINVAL;
+ return -ENOTSUPP;
}
}
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
index 2e9cab09f37b..58d83aa7740f 100644
--- a/net/ceph/crypto.h
+++ b/net/ceph/crypto.h
@@ -12,37 +12,19 @@ struct ceph_crypto_key {
struct ceph_timespec created;
int len;
void *key;
+ struct crypto_skcipher *tfm;
};
-static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
-{
- if (key) {
- kfree(key->key);
- key->key = NULL;
- }
-}
-
int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
const struct ceph_crypto_key *src);
int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end);
int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end);
int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
+void ceph_crypto_key_destroy(struct ceph_crypto_key *key);
/* crypto.c */
-int ceph_decrypt(struct ceph_crypto_key *secret,
- void *dst, size_t *dst_len,
- const void *src, size_t src_len);
-int ceph_encrypt(struct ceph_crypto_key *secret,
- void *dst, size_t *dst_len,
- const void *src, size_t src_len);
-int ceph_decrypt2(struct ceph_crypto_key *secret,
- void *dst1, size_t *dst1_len,
- void *dst2, size_t *dst2_len,
- const void *src, size_t src_len);
-int ceph_encrypt2(struct ceph_crypto_key *secret,
- void *dst, size_t *dst_len,
- const void *src1, size_t src1_len,
- const void *src2, size_t src2_len);
+int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt,
+ void *buf, int buf_len, int in_len, int *pout_len);
int ceph_crypto_init(void);
void ceph_crypto_shutdown(void);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index a5502898ea33..f76bb3332613 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -7,6 +7,7 @@
#include <linux/kthread.h>
#include <linux/net.h>
#include <linux/nsproxy.h>
+#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/string.h>
@@ -469,11 +470,16 @@ static int ceph_tcp_connect(struct ceph_connection *con)
{
struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
struct socket *sock;
+ unsigned int noio_flag;
int ret;
BUG_ON(con->sock);
+
+ /* sock_create_kern() allocates with GFP_KERNEL */
+ noio_flag = memalloc_noio_save();
ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family,
SOCK_STREAM, IPPROTO_TCP, &sock);
+ memalloc_noio_restore(noio_flag);
if (ret)
return ret;
sock->sk->sk_allocation = GFP_NOFS;
@@ -520,7 +526,8 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
int r;
- r = kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+ iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, len);
+ r = sock_recvmsg(sock, &msg, msg.msg_flags);
if (r == -EAGAIN)
r = 0;
return r;
@@ -529,17 +536,20 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
int page_offset, size_t length)
{
- void *kaddr;
- int ret;
+ struct bio_vec bvec = {
+ .bv_page = page,
+ .bv_offset = page_offset,
+ .bv_len = length
+ };
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int r;
BUG_ON(page_offset + length > PAGE_SIZE);
-
- kaddr = kmap(page);
- BUG_ON(!kaddr);
- ret = ceph_tcp_recvmsg(sock, kaddr + page_offset, length);
- kunmap(page);
-
- return ret;
+ iov_iter_bvec(&msg.msg_iter, READ | ITER_BVEC, &bvec, 1, length);
+ r = sock_recvmsg(sock, &msg, msg.msg_flags);
+ if (r == -EAGAIN)
+ r = 0;
+ return r;
}
/*
@@ -579,18 +589,28 @@ static int __ceph_tcp_sendpage(struct socket *sock, struct page *page,
static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
int offset, size_t size, bool more)
{
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ struct bio_vec bvec;
int ret;
- struct kvec iov;
/* sendpage cannot properly handle pages with page_count == 0,
* we need to fallback to sendmsg if that's the case */
if (page_count(page) >= 1)
return __ceph_tcp_sendpage(sock, page, offset, size, more);
- iov.iov_base = kmap(page) + offset;
- iov.iov_len = size;
- ret = ceph_tcp_sendmsg(sock, &iov, 1, size, more);
- kunmap(page);
+ bvec.bv_page = page;
+ bvec.bv_offset = offset;
+ bvec.bv_len = size;
+
+ if (more)
+ msg.msg_flags |= MSG_MORE;
+ else
+ msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
+
+ iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC, &bvec, 1, size);
+ ret = sock_sendmsg(sock, &msg);
+ if (ret == -EAGAIN)
+ ret = 0;
return ret;
}
@@ -1393,15 +1413,9 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection
return NULL;
}
- /* Can't hold the mutex while getting authorizer */
- mutex_unlock(&con->mutex);
auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry);
- mutex_lock(&con->mutex);
-
if (IS_ERR(auth))
return auth;
- if (con->state != CON_STATE_NEGOTIATING)
- return ERR_PTR(-EAGAIN);
con->auth_reply_buf = auth->authorizer_reply_buf;
con->auth_reply_buf_len = auth->authorizer_reply_buf_len;
@@ -2027,6 +2041,19 @@ static int process_connect(struct ceph_connection *con)
dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
+ if (con->auth_reply_buf) {
+ /*
+ * Any connection that defines ->get_authorizer()
+ * should also define ->verify_authorizer_reply().
+ * See get_connect_authorizer().
+ */
+ ret = con->ops->verify_authorizer_reply(con);
+ if (ret < 0) {
+ con->error_msg = "bad authorize reply";
+ return ret;
+ }
+ }
+
switch (con->in_reply.tag) {
case CEPH_MSGR_TAG_FEATURES:
pr_err("%s%lld %s feature set mismatch,"
@@ -3418,7 +3445,7 @@ static void ceph_msg_release(struct kref *kref)
struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
{
dout("%s %p (was %d)\n", __func__, msg,
- atomic_read(&msg->kref.refcount));
+ kref_read(&msg->kref));
kref_get(&msg->kref);
return msg;
}
@@ -3427,7 +3454,7 @@ EXPORT_SYMBOL(ceph_msg_get);
void ceph_msg_put(struct ceph_msg *msg)
{
dout("%s %p (was %d)\n", __func__, msg,
- atomic_read(&msg->kref.refcount));
+ kref_read(&msg->kref));
kref_put(&msg->kref, ceph_msg_release);
}
EXPORT_SYMBOL(ceph_msg_put);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index a8effc8b7280..29a0ef351c5e 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1028,21 +1028,21 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
err = -ENOMEM;
monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
sizeof(struct ceph_mon_subscribe_ack),
- GFP_NOFS, true);
+ GFP_KERNEL, true);
if (!monc->m_subscribe_ack)
goto out_auth;
- monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS,
- true);
+ monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128,
+ GFP_KERNEL, true);
if (!monc->m_subscribe)
goto out_subscribe_ack;
- monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS,
- true);
+ monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096,
+ GFP_KERNEL, true);
if (!monc->m_auth_reply)
goto out_subscribe;
- monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true);
+ monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_KERNEL, true);
monc->pending_auth = 0;
if (!monc->m_auth)
goto out_auth_reply;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index e6ae15bc41b7..e15ea9e4c495 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -438,7 +438,7 @@ static void ceph_osdc_release_request(struct kref *kref)
void ceph_osdc_get_request(struct ceph_osd_request *req)
{
dout("%s %p (was %d)\n", __func__, req,
- atomic_read(&req->r_kref.refcount));
+ kref_read(&req->r_kref));
kref_get(&req->r_kref);
}
EXPORT_SYMBOL(ceph_osdc_get_request);
@@ -447,7 +447,7 @@ void ceph_osdc_put_request(struct ceph_osd_request *req)
{
if (req) {
dout("%s %p (was %d)\n", __func__, req,
- atomic_read(&req->r_kref.refcount));
+ kref_read(&req->r_kref));
kref_put(&req->r_kref, ceph_osdc_release_request);
}
}
@@ -460,7 +460,6 @@ static void request_init(struct ceph_osd_request *req)
kref_init(&req->r_kref);
init_completion(&req->r_completion);
- init_completion(&req->r_safe_completion);
RB_CLEAR_NODE(&req->r_node);
RB_CLEAR_NODE(&req->r_mc_node);
INIT_LIST_HEAD(&req->r_unsafe_item);
@@ -487,11 +486,11 @@ static void request_reinit(struct ceph_osd_request *req)
struct ceph_msg *reply_msg = req->r_reply;
dout("%s req %p\n", __func__, req);
- WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
+ WARN_ON(kref_read(&req->r_kref) != 1);
request_release_checks(req);
- WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
- WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
+ WARN_ON(kref_read(&request_msg->kref) != 1);
+ WARN_ON(kref_read(&reply_msg->kref) != 1);
target_destroy(&req->r_t);
request_init(req);
@@ -672,7 +671,8 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
BUG_ON(length > previous);
op->extent.length = length;
- op->indata_len -= previous - length;
+ if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
+ op->indata_len -= previous - length;
}
EXPORT_SYMBOL(osd_req_op_extent_update);
@@ -1636,7 +1636,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
bool need_send = false;
bool promoted = false;
- WARN_ON(req->r_tid || req->r_got_reply);
+ WARN_ON(req->r_tid);
dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
again:
@@ -1704,18 +1704,13 @@ promote:
static void account_request(struct ceph_osd_request *req)
{
- unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
-
- if (req->r_flags & CEPH_OSD_FLAG_READ) {
- WARN_ON(req->r_flags & mask);
- req->r_flags |= CEPH_OSD_FLAG_ACK;
- } else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
- WARN_ON(!(req->r_flags & mask));
- else
- WARN_ON(1);
+ WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
+ WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));
- WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
+ req->r_flags |= CEPH_OSD_FLAG_ONDISK;
atomic_inc(&req->r_osdc->num_requests);
+
+ req->r_start_stamp = jiffies;
}
static void submit_request(struct ceph_osd_request *req, bool wrlocked)
@@ -1725,7 +1720,7 @@ static void submit_request(struct ceph_osd_request *req, bool wrlocked)
__submit_request(req, wrlocked);
}
-static void __finish_request(struct ceph_osd_request *req)
+static void finish_request(struct ceph_osd_request *req)
{
struct ceph_osd_client *osdc = req->r_osdc;
struct ceph_osd *osd = req->r_osd;
@@ -1747,32 +1742,26 @@ static void __finish_request(struct ceph_osd_request *req)
ceph_msg_revoke_incoming(req->r_reply);
}
-static void finish_request(struct ceph_osd_request *req)
-{
- __finish_request(req);
- ceph_osdc_put_request(req);
-}
-
static void __complete_request(struct ceph_osd_request *req)
{
- if (req->r_callback)
+ if (req->r_callback) {
+ dout("%s req %p tid %llu cb %pf result %d\n", __func__, req,
+ req->r_tid, req->r_callback, req->r_result);
req->r_callback(req);
- else
- complete_all(&req->r_completion);
+ }
}
/*
- * Note that this is open-coded in handle_reply(), which has to deal
- * with ack vs commit, dup acks, etc.
+ * This is open-coded in handle_reply().
*/
static void complete_request(struct ceph_osd_request *req, int err)
{
dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
req->r_result = err;
- __finish_request(req);
+ finish_request(req);
__complete_request(req);
- complete_all(&req->r_safe_completion);
+ complete_all(&req->r_completion);
ceph_osdc_put_request(req);
}
@@ -1798,6 +1787,16 @@ static void cancel_request(struct ceph_osd_request *req)
cancel_map_check(req);
finish_request(req);
+ complete_all(&req->r_completion);
+ ceph_osdc_put_request(req);
+}
+
+static void abort_request(struct ceph_osd_request *req, int err)
+{
+ dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
+
+ cancel_map_check(req);
+ complete_request(req, err);
}
static void check_pool_dne(struct ceph_osd_request *req)
@@ -2173,7 +2172,6 @@ static void linger_commit_cb(struct ceph_osd_request *req)
mutex_lock(&lreq->lock);
dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
lreq->linger_id, req->r_result);
- WARN_ON(!__linger_registered(lreq));
linger_reg_commit_complete(lreq, req->r_result);
lreq->committed = true;
@@ -2499,6 +2497,7 @@ static void handle_timeout(struct work_struct *work)
container_of(work, struct ceph_osd_client, timeout_work.work);
struct ceph_options *opts = osdc->client->options;
unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
+ unsigned long expiry_cutoff = jiffies - opts->osd_request_timeout;
LIST_HEAD(slow_osds);
struct rb_node *n, *p;
@@ -2514,15 +2513,23 @@ static void handle_timeout(struct work_struct *work)
struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
bool found = false;
- for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+ for (p = rb_first(&osd->o_requests); p; ) {
struct ceph_osd_request *req =
rb_entry(p, struct ceph_osd_request, r_node);
+ p = rb_next(p); /* abort_request() */
+
if (time_before(req->r_stamp, cutoff)) {
dout(" req %p tid %llu on osd%d is laggy\n",
req, req->r_tid, osd->o_osd);
found = true;
}
+ if (opts->osd_request_timeout &&
+ time_before(req->r_start_stamp, expiry_cutoff)) {
+ pr_err_ratelimited("tid %llu on osd%d timeout\n",
+ req->r_tid, osd->o_osd);
+ abort_request(req, -ETIMEDOUT);
+ }
}
for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
struct ceph_osd_linger_request *lreq =
@@ -2542,6 +2549,21 @@ static void handle_timeout(struct work_struct *work)
list_move_tail(&osd->o_keepalive_item, &slow_osds);
}
+ if (opts->osd_request_timeout) {
+ for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
+ struct ceph_osd_request *req =
+ rb_entry(p, struct ceph_osd_request, r_node);
+
+ p = rb_next(p); /* abort_request() */
+
+ if (time_before(req->r_start_stamp, expiry_cutoff)) {
+ pr_err_ratelimited("tid %llu on osd%d timeout\n",
+ req->r_tid, osdc->homeless_osd.o_osd);
+ abort_request(req, -ETIMEDOUT);
+ }
+ }
+ }
+
if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
maybe_request_map(osdc);
@@ -2789,31 +2811,8 @@ e_inval:
}
/*
- * We are done with @req if
- * - @m is a safe reply, or
- * - @m is an unsafe reply and we didn't want a safe one
- */
-static bool done_request(const struct ceph_osd_request *req,
- const struct MOSDOpReply *m)
-{
- return (m->result < 0 ||
- (m->flags & CEPH_OSD_FLAG_ONDISK) ||
- !(req->r_flags & CEPH_OSD_FLAG_ONDISK));
-}
-
-/*
- * handle osd op reply. either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- *
- * ->r_unsafe_callback is set? yes no
- *
- * first reply is OK (needed r_cb/r_completion, r_cb/r_completion,
- * any or needed/got safe) r_safe_completion r_safe_completion
- *
- * first reply is unsafe r_unsafe_cb(true) (nothing)
- *
- * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion,
- * r_safe_completion r_safe_completion
+ * Handle MOSDOpReply. Set ->r_result and call the callback if it is
+ * specified.
*/
static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
{
@@ -2822,7 +2821,6 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
struct MOSDOpReply m;
u64 tid = le64_to_cpu(msg->hdr.tid);
u32 data_len = 0;
- bool already_acked;
int ret;
int i;
@@ -2901,51 +2899,22 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
le32_to_cpu(msg->hdr.data_len), req->r_tid);
goto fail_request;
}
- dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
- req, req->r_tid, req->r_got_reply, m.result, data_len);
-
- already_acked = req->r_got_reply;
- if (!already_acked) {
- req->r_result = m.result ?: data_len;
- req->r_replay_version = m.replay_version; /* struct */
- req->r_got_reply = true;
- } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
- dout("req %p tid %llu dup ack\n", req, req->r_tid);
- goto out_unlock_session;
- }
-
- if (done_request(req, &m)) {
- __finish_request(req);
- if (req->r_linger) {
- WARN_ON(req->r_unsafe_callback);
- dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
- __complete_request(req);
- }
- }
+ dout("%s req %p tid %llu result %d data_len %u\n", __func__,
+ req, req->r_tid, m.result, data_len);
+ /*
+ * Since we only ever request ONDISK, we should only ever get
+ * one (type of) reply back.
+ */
+ WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
+ req->r_result = m.result ?: data_len;
+ finish_request(req);
mutex_unlock(&osd->lock);
up_read(&osdc->lock);
- if (done_request(req, &m)) {
- if (already_acked && req->r_unsafe_callback) {
- dout("req %p tid %llu safe-cb\n", req, req->r_tid);
- req->r_unsafe_callback(req, false);
- } else if (!req->r_linger) {
- dout("req %p tid %llu cb\n", req, req->r_tid);
- __complete_request(req);
- }
- if (m.flags & CEPH_OSD_FLAG_ONDISK)
- complete_all(&req->r_safe_completion);
- ceph_osdc_put_request(req);
- } else {
- if (req->r_unsafe_callback) {
- dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
- req->r_unsafe_callback(req, true);
- } else {
- WARN_ON(1);
- }
- }
-
+ __complete_request(req);
+ complete_all(&req->r_completion);
+ ceph_osdc_put_request(req);
return;
fail_request:
@@ -3471,9 +3440,8 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
EXPORT_SYMBOL(ceph_osdc_start_request);
/*
- * Unregister a registered request. The request is not completed (i.e.
- * no callbacks or wakeups) - higher layers are supposed to know what
- * they are canceling.
+ * Unregister a registered request. The request is not completed:
+ * ->r_result isn't set and __complete_request() isn't called.
*/
void ceph_osdc_cancel_request(struct ceph_osd_request *req)
{
@@ -3500,9 +3468,6 @@ static int wait_request_timeout(struct ceph_osd_request *req,
if (left <= 0) {
left = left ?: -ETIMEDOUT;
ceph_osdc_cancel_request(req);
-
- /* kludge - need to to wake ceph_osdc_sync() */
- complete_all(&req->r_safe_completion);
} else {
left = req->r_result; /* completed */
}
@@ -3549,7 +3514,7 @@ again:
up_read(&osdc->lock);
dout("%s waiting on req %p tid %llu last_tid %llu\n",
__func__, req, req->r_tid, last_tid);
- wait_for_completion(&req->r_safe_completion);
+ wait_for_completion(&req->r_completion);
ceph_osdc_put_request(req);
goto again;
}
@@ -3608,7 +3573,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
ceph_oid_copy(&lreq->t.base_oid, oid);
ceph_oloc_copy(&lreq->t.base_oloc, oloc);
- lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+ lreq->t.flags = CEPH_OSD_FLAG_WRITE;
lreq->mtime = CURRENT_TIME;
lreq->reg_req = alloc_linger_request(lreq);
@@ -3666,7 +3631,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
- req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+ req->r_flags = CEPH_OSD_FLAG_WRITE;
req->r_mtime = CURRENT_TIME;
osd_req_op_watch_init(req, 0, lreq->linger_id,
CEPH_OSD_WATCH_OP_UNWATCH);
@@ -4031,7 +3996,7 @@ EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
* Execute an OSD class method on an object.
*
* @flags: CEPH_OSD_FLAG_*
- * @resp_len: out param for reply length
+ * @resp_len: in/out param for reply length
*/
int ceph_osdc_call(struct ceph_osd_client *osdc,
struct ceph_object_id *oid,
@@ -4044,6 +4009,9 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
struct ceph_osd_request *req;
int ret;
+ if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE))
+ return -E2BIG;
+
req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
if (!req)
return -ENOMEM;
@@ -4062,7 +4030,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
0, false, false);
if (resp_page)
osd_req_op_cls_response_data_pages(req, 0, &resp_page,
- PAGE_SIZE, 0, false, false);
+ *resp_len, 0, false, false);
ceph_osdc_start_request(osdc, req, false);
ret = ceph_osdc_wait_request(osdc, req);
@@ -4229,8 +4197,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
int page_align = off & ~PAGE_MASK;
req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
- CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
snapc, truncate_seq, truncate_size,
true);
if (IS_ERR(req))
@@ -4478,13 +4445,13 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
}
-static int verify_authorizer_reply(struct ceph_connection *con, int len)
+static int verify_authorizer_reply(struct ceph_connection *con)
{
struct ceph_osd *o = con->private;
struct ceph_osd_client *osdc = o->o_osdc;
struct ceph_auth_client *ac = osdc->client->monc.auth;
- return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer, len);
+ return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer);
}
static int invalidate_authorizer(struct ceph_connection *con)
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index d2436880b305..ffe9e904d4d1 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -153,6 +153,32 @@ bad:
return -EINVAL;
}
+static void crush_finalize(struct crush_map *c)
+{
+ __s32 b;
+
+ /* Space for the array of pointers to per-bucket workspace */
+ c->working_size = sizeof(struct crush_work) +
+ c->max_buckets * sizeof(struct crush_work_bucket *);
+
+ for (b = 0; b < c->max_buckets; b++) {
+ if (!c->buckets[b])
+ continue;
+
+ switch (c->buckets[b]->alg) {
+ default:
+ /*
+ * The base case, permutation variables and
+ * the pointer to the permutation array.
+ */
+ c->working_size += sizeof(struct crush_work_bucket);
+ break;
+ }
+ /* Every bucket has a permutation array. */
+ c->working_size += c->buckets[b]->size * sizeof(__u32);
+ }
+}
+
static struct crush_map *crush_decode(void *pbyval, void *end)
{
struct crush_map *c;
@@ -246,10 +272,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
if (b->items == NULL)
goto badmem;
- b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
- if (b->perm == NULL)
- goto badmem;
- b->perm_n = 0;
ceph_decode_need(p, end, b->size*sizeof(u32), bad);
for (j = 0; j < b->size; j++)
@@ -369,6 +391,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
c->chooseleaf_stable);
done:
+ crush_finalize(c);
dout("crush_decode success\n");
return c;
@@ -719,7 +742,7 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
map->pool_max = -1;
map->pg_temp = RB_ROOT;
map->primary_temp = RB_ROOT;
- mutex_init(&map->crush_scratch_mutex);
+ mutex_init(&map->crush_workspace_mutex);
return map;
}
@@ -753,6 +776,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
kfree(map->osd_weight);
kfree(map->osd_addr);
kfree(map->osd_primary_affinity);
+ kfree(map->crush_workspace);
kfree(map);
}
@@ -808,6 +832,31 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
return 0;
}
+static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
+{
+ void *workspace;
+ size_t work_size;
+
+ if (IS_ERR(crush))
+ return PTR_ERR(crush);
+
+ work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
+ dout("%s work_size %zu bytes\n", __func__, work_size);
+ workspace = kmalloc(work_size, GFP_NOIO);
+ if (!workspace) {
+ crush_destroy(crush);
+ return -ENOMEM;
+ }
+ crush_init_workspace(crush, workspace);
+
+ if (map->crush)
+ crush_destroy(map->crush);
+ kfree(map->crush_workspace);
+ map->crush = crush;
+ map->crush_workspace = workspace;
+ return 0;
+}
+
#define OSDMAP_WRAPPER_COMPAT_VER 7
#define OSDMAP_CLIENT_DATA_COMPAT_VER 1
@@ -1214,13 +1263,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
/* crush */
ceph_decode_32_safe(p, end, len, e_inval);
- map->crush = crush_decode(*p, min(*p + len, end));
- if (IS_ERR(map->crush)) {
- err = PTR_ERR(map->crush);
- map->crush = NULL;
+ err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end)));
+ if (err)
goto bad;
- }
- *p += len;
/* ignore the rest */
*p = end;
@@ -1334,7 +1379,6 @@ static int decode_new_up_state_weight(void **p, void *end,
if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
(xorstate & CEPH_OSD_EXISTS)) {
pr_info("osd%d does not exist\n", osd);
- map->osd_weight[osd] = CEPH_OSD_IN;
ret = set_primary_affinity(map, osd,
CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
if (ret)
@@ -1375,7 +1419,6 @@ e_inval:
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
struct ceph_osdmap *map)
{
- struct crush_map *newcrush = NULL;
struct ceph_fsid fsid;
u32 epoch = 0;
struct ceph_timespec modified;
@@ -1414,12 +1457,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
/* new crush? */
ceph_decode_32_safe(p, end, len, e_inval);
if (len > 0) {
- newcrush = crush_decode(*p, min(*p+len, end));
- if (IS_ERR(newcrush)) {
- err = PTR_ERR(newcrush);
- newcrush = NULL;
+ err = osdmap_set_crush(map,
+ crush_decode(*p, min(*p + len, end)));
+ if (err)
goto bad;
- }
*p += len;
}
@@ -1439,12 +1480,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
map->epoch++;
map->modified = modified;
- if (newcrush) {
- if (map->crush)
- crush_destroy(map->crush);
- map->crush = newcrush;
- newcrush = NULL;
- }
/* new_pools */
err = decode_new_pools(p, end, map);
@@ -1505,8 +1540,6 @@ bad:
print_hex_dump(KERN_DEBUG, "osdmap: ",
DUMP_PREFIX_OFFSET, 16, 1,
start, end - start, true);
- if (newcrush)
- crush_destroy(newcrush);
return ERR_PTR(err);
}
@@ -1942,10 +1975,10 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
BUG_ON(result_max > CEPH_PG_MAX_SIZE);
- mutex_lock(&map->crush_scratch_mutex);
+ mutex_lock(&map->crush_workspace_mutex);
r = crush_do_rule(map->crush, ruleno, x, result, result_max,
- weight, weight_max, map->crush_scratch_ary);
- mutex_unlock(&map->crush_scratch_mutex);
+ weight, weight_max, map->crush_workspace);
+ mutex_unlock(&map->crush_workspace_mutex);
return r;
}
@@ -1978,8 +2011,14 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
return;
}
- len = do_crush(osdmap, ruleno, pps, raw->osds,
- min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
+ if (pi->size > ARRAY_SIZE(raw->osds)) {
+ pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n",
+ pi->id, pi->crush_ruleset, pi->type, pi->size,
+ ARRAY_SIZE(raw->osds));
+ return;
+ }
+
+ len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
osdmap->osd_weight, osdmap->max_osd);
if (len < 0) {
pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c
index 154683f5f14c..705414e78ae0 100644
--- a/net/ceph/snapshot.c
+++ b/net/ceph/snapshot.c
@@ -18,8 +18,6 @@
* 02110-1301, USA.
*/
-#include <stddef.h>
-
#include <linux/types.h>
#include <linux/export.h>
#include <linux/ceph/libceph.h>
diff --git a/net/compat.c b/net/compat.c
index 1cd2ec046164..aba929e5250f 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -22,13 +22,14 @@
#include <linux/filter.h>
#include <linux/compat.h>
#include <linux/security.h>
+#include <linux/audit.h>
#include <linux/export.h>
#include <net/scm.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/ipv6.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/compat.h>
int get_compat_msghdr(struct msghdr *kmsg,
@@ -90,11 +91,11 @@ int get_compat_msghdr(struct msghdr *kmsg,
#define CMSG_COMPAT_ALIGN(len) ALIGN((len), sizeof(s32))
#define CMSG_COMPAT_DATA(cmsg) \
- ((void __user *)((char __user *)(cmsg) + CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr))))
+ ((void __user *)((char __user *)(cmsg) + sizeof(struct compat_cmsghdr)))
#define CMSG_COMPAT_SPACE(len) \
- (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + CMSG_COMPAT_ALIGN(len))
+ (sizeof(struct compat_cmsghdr) + CMSG_COMPAT_ALIGN(len))
#define CMSG_COMPAT_LEN(len) \
- (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + (len))
+ (sizeof(struct compat_cmsghdr) + (len))
#define CMSG_COMPAT_FIRSTHDR(msg) \
(((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ? \
@@ -130,6 +131,9 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
__kernel_size_t kcmlen, tmp;
int err = -EFAULT;
+ BUILD_BUG_ON(sizeof(struct compat_cmsghdr) !=
+ CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)));
+
kcmlen = 0;
kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf;
ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg);
@@ -141,8 +145,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
return -EINVAL;
- tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
- CMSG_ALIGN(sizeof(struct cmsghdr)));
+ tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr));
tmp = CMSG_ALIGN(tmp);
kcmlen += tmp;
ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
@@ -168,8 +171,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
goto Efault;
if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
goto Einval;
- tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
- CMSG_ALIGN(sizeof(struct cmsghdr)));
+ tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr));
if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp))
goto Einval;
kcmsg->cmsg_len = tmp;
@@ -178,7 +180,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
__get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) ||
copy_from_user(CMSG_DATA(kcmsg),
CMSG_COMPAT_DATA(ucmsg),
- (ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg)))))
+ (ucmlen - sizeof(*ucmsg))))
goto Efault;
/* Advance. */
@@ -781,14 +783,24 @@ COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
{
- int ret;
- u32 a[6];
+ u32 a[AUDITSC_ARGS];
+ unsigned int len;
u32 a0, a1;
+ int ret;
if (call < SYS_SOCKET || call > SYS_SENDMMSG)
return -EINVAL;
- if (copy_from_user(a, args, nas[call]))
+ len = nas[call];
+ if (len > sizeof(a))
+ return -EINVAL;
+
+ if (copy_from_user(a, args, len))
return -EFAULT;
+
+ ret = audit_socketcall_compat(len / sizeof(a[0]), a);
+ if (ret)
+ return ret;
+
a0 = a[0];
a1 = a[1];
diff --git a/net/core/Makefile b/net/core/Makefile
index d6508c2ddca5..79f9479e9658 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -24,6 +24,8 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
+obj-$(CONFIG_GRO_CELLS) += gro_cells.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index b7de71f8d5d3..f4947e737f34 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -36,7 +36,7 @@
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
@@ -165,6 +165,7 @@ done:
* __skb_try_recv_datagram - Receive a datagram skbuff
* @sk: socket
* @flags: MSG_ flags
+ * @destructor: invoked under the receive lock on successful dequeue
* @peeked: returns non-zero if this packet has been seen before
* @off: an offset in bytes to peek skb from. Returns an offset
* within an skb where data actually starts
@@ -197,6 +198,8 @@ done:
* the standard around please.
*/
struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb),
int *peeked, int *off, int *err,
struct sk_buff **last)
{
@@ -211,6 +214,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
if (error)
goto no_packet;
+ *peeked = 0;
do {
/* Again only user level code calls this function, so nothing
* interrupt level will suddenly eat the receive_queue.
@@ -224,26 +228,28 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
spin_lock_irqsave(&queue->lock, cpu_flags);
skb_queue_walk(queue, skb) {
*last = skb;
- *peeked = skb->peeked;
if (flags & MSG_PEEK) {
if (_off >= skb->len && (skb->len || _off ||
skb->peeked)) {
_off -= skb->len;
continue;
}
-
- skb = skb_set_peeked(skb);
- error = PTR_ERR(skb);
- if (IS_ERR(skb)) {
- spin_unlock_irqrestore(&queue->lock,
- cpu_flags);
- goto no_packet;
+ if (!skb->len) {
+ skb = skb_set_peeked(skb);
+ if (IS_ERR(skb)) {
+ error = PTR_ERR(skb);
+ spin_unlock_irqrestore(&queue->lock,
+ cpu_flags);
+ goto no_packet;
+ }
}
-
+ *peeked = 1;
atomic_inc(&skb->users);
- } else
+ } else {
__skb_unlink(skb, queue);
-
+ if (destructor)
+ destructor(sk, skb);
+ }
spin_unlock_irqrestore(&queue->lock, cpu_flags);
*off = _off;
return skb;
@@ -262,6 +268,8 @@ no_packet:
EXPORT_SYMBOL(__skb_try_recv_datagram);
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb),
int *peeked, int *off, int *err)
{
struct sk_buff *skb, *last;
@@ -270,8 +278,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
- skb = __skb_try_recv_datagram(sk, flags, peeked, off, err,
- &last);
+ skb = __skb_try_recv_datagram(sk, flags, destructor, peeked,
+ off, err, &last);
if (skb)
return skb;
@@ -290,7 +298,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
int peeked, off = 0;
return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
- &peeked, &off, err);
+ NULL, &peeked, &off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);
@@ -323,6 +331,31 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
}
EXPORT_SYMBOL(__skb_free_datagram_locked);
+int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
+ unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb))
+{
+ int err = 0;
+
+ if (flags & MSG_PEEK) {
+ err = -ENOENT;
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ if (skb == skb_peek(&sk->sk_receive_queue)) {
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ atomic_dec(&skb->users);
+ if (destructor)
+ destructor(sk, skb);
+ err = 0;
+ }
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ }
+
+ atomic_inc(&sk->sk_drops);
+ return err;
+}
+EXPORT_SYMBOL(__sk_queue_drop_skb);
+
/**
* skb_kill_datagram - Free a datagram skbuff forcibly
* @sk: socket
@@ -346,23 +379,10 @@ EXPORT_SYMBOL(__skb_free_datagram_locked);
int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
- int err = 0;
-
- if (flags & MSG_PEEK) {
- err = -ENOENT;
- spin_lock_bh(&sk->sk_receive_queue.lock);
- if (skb == skb_peek(&sk->sk_receive_queue)) {
- __skb_unlink(skb, &sk->sk_receive_queue);
- atomic_dec(&skb->users);
- err = 0;
- }
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- }
+ int err = __sk_queue_drop_skb(sk, skb, flags, NULL);
kfree_skb(skb);
- atomic_inc(&sk->sk_drops);
sk_mem_reclaim_partial(sk);
-
return err;
}
EXPORT_SYMBOL(skb_kill_datagram);
@@ -378,7 +398,7 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len)
{
int start = skb_headlen(skb);
- int i, copy = start - offset;
+ int i, copy = start - offset, start_off = offset, n;
struct sk_buff *frag_iter;
trace_skb_copy_datagram_iovec(skb, len);
@@ -387,11 +407,12 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
if (copy > 0) {
if (copy > len)
copy = len;
- if (copy_to_iter(skb->data + offset, copy, to) != copy)
+ n = copy_to_iter(skb->data + offset, copy, to);
+ offset += n;
+ if (n != copy)
goto short_copy;
if ((len -= copy) == 0)
return 0;
- offset += copy;
}
/* Copy paged appendix. Hmm... why does this look so complicated? */
@@ -405,13 +426,14 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
if ((copy = end - offset) > 0) {
if (copy > len)
copy = len;
- if (copy_page_to_iter(skb_frag_page(frag),
+ n = copy_page_to_iter(skb_frag_page(frag),
frag->page_offset + offset -
- start, copy, to) != copy)
+ start, copy, to);
+ offset += n;
+ if (n != copy)
goto short_copy;
if (!(len -= copy))
return 0;
- offset += copy;
}
start = end;
}
@@ -443,6 +465,7 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
*/
fault:
+ iov_iter_revert(to, offset - start_off);
return -EFAULT;
short_copy:
@@ -593,7 +616,7 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
__wsum *csump)
{
int start = skb_headlen(skb);
- int i, copy = start - offset;
+ int i, copy = start - offset, start_off = offset;
struct sk_buff *frag_iter;
int pos = 0;
int n;
@@ -603,11 +626,11 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
+ offset += n;
if (n != copy)
goto fault;
if ((len -= copy) == 0)
return 0;
- offset += copy;
pos = copy;
}
@@ -629,12 +652,12 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
offset - start, copy,
&csum2, to);
kunmap(page);
+ offset += n;
if (n != copy)
goto fault;
*csump = csum_block_add(*csump, csum2, pos);
if (!(len -= copy))
return 0;
- offset += copy;
pos += copy;
}
start = end;
@@ -667,6 +690,7 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
return 0;
fault:
+ iov_iter_revert(to, offset - start_off);
return -EFAULT;
}
@@ -751,6 +775,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
}
return 0;
csum_error:
+ iov_iter_revert(&msg->msg_iter, chunk);
return -EINVAL;
fault:
return -EFAULT;
diff --git a/net/core/dev.c b/net/core/dev.c
index 6666b28b6815..533a6d6f6092 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1,5 +1,5 @@
/*
- * NET3 Protocol independent device support routines.
+ * NET3 Protocol independent device support routines.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -7,7 +7,7 @@
* 2 of the License, or (at your option) any later version.
*
* Derived from the non IP parts of dev.c 1.0.19
- * Authors: Ross Biro
+ * Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
*
@@ -21,9 +21,9 @@
*
* Changes:
* D.J. Barrow : Fixed bug where dev->refcnt gets set
- * to 2 if register_netdev gets called
- * before net_dev_init & also removed a
- * few lines of code in the process.
+ * to 2 if register_netdev gets called
+ * before net_dev_init & also removed a
+ * few lines of code in the process.
* Alan Cox : device private ioctl copies fields back.
* Alan Cox : Transmit queue code does relevant
* stunts to keep the queue safe.
@@ -36,7 +36,7 @@
* Alan Cox : 100 backlog just doesn't cut it when
* you start doing multicast video 8)
* Alan Cox : Rewrote net_bh and list manager.
- * Alan Cox : Fix ETH_P_ALL echoback lengths.
+ * Alan Cox : Fix ETH_P_ALL echoback lengths.
* Alan Cox : Took out transmit every packet pass
* Saved a few bytes in the ioctl handler
* Alan Cox : Network driver sets packet type before
@@ -46,7 +46,7 @@
* Richard Kooijman: Timestamp fixes.
* Alan Cox : Wrong field in SIOCGIFDSTADDR
* Alan Cox : Device lock protection.
- * Alan Cox : Fixed nasty side effect of device close
+ * Alan Cox : Fixed nasty side effect of device close
* changes.
* Rudi Cilibrasi : Pass the right thing to
* set_mac_address()
@@ -67,12 +67,12 @@
* Paul Rusty Russell : SIOCSIFNAME
* Pekka Riikonen : Netdev boot-time settings code
* Andrew Morton : Make unregister_netdevice wait
- * indefinitely on dev->refcnt
- * J Hadi Salim : - Backlog queue sampling
+ * indefinitely on dev->refcnt
+ * J Hadi Salim : - Backlog queue sampling
* - netif_rx() feedback
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -139,7 +139,6 @@
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
-#include <linux/sctp.h>
#include <linux/crash_dump.h>
#include "net-sysfs.h"
@@ -193,7 +192,8 @@ static seqcount_t devnet_rename_seq;
static inline void dev_base_seq_inc(struct net *net)
{
- while (++net->dev_base_seq == 0);
+ while (++net->dev_base_seq == 0)
+ ;
}
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
@@ -275,8 +275,8 @@ EXPORT_PER_CPU_SYMBOL(softnet_data);
* register_netdevice() inits txq->_xmit_lock and sets lockdep class
* according to dev->type
*/
-static const unsigned short netdev_lock_type[] =
- {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
+static const unsigned short netdev_lock_type[] = {
+ ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
@@ -292,22 +292,22 @@ static const unsigned short netdev_lock_type[] =
ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
-static const char *const netdev_lock_name[] =
- {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
- "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
- "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
- "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
- "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
- "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
- "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
- "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
- "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
- "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
- "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
- "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
- "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
- "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
- "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
+static const char *const netdev_lock_name[] = {
+ "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
+ "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
+ "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
+ "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
+ "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
+ "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
+ "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
+ "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
+ "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
+ "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
+ "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
+ "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
+ "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
+ "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
+ "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
@@ -353,10 +353,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
#endif
/*******************************************************************************
+ *
+ * Protocol management and registration routines
+ *
+ *******************************************************************************/
- Protocol management and registration routines
-
-*******************************************************************************/
/*
* Add a protocol ID to the list. Now that the input handler is
@@ -539,10 +540,10 @@ void dev_remove_offload(struct packet_offload *po)
EXPORT_SYMBOL(dev_remove_offload);
/******************************************************************************
-
- Device Boot-time Settings Routines
-
-*******************************************************************************/
+ *
+ * Device Boot-time Settings Routines
+ *
+ ******************************************************************************/
/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
@@ -575,13 +576,13 @@ static int netdev_boot_setup_add(char *name, struct ifmap *map)
}
/**
- * netdev_boot_setup_check - check boot time settings
- * @dev: the netdevice
+ * netdev_boot_setup_check - check boot time settings
+ * @dev: the netdevice
*
- * Check boot time settings for the device.
- * The found settings are set for the device to be used
- * later in the device probing.
- * Returns 0 if no settings found, 1 if they are.
+ * Check boot time settings for the device.
+ * The found settings are set for the device to be used
+ * later in the device probing.
+ * Returns 0 if no settings found, 1 if they are.
*/
int netdev_boot_setup_check(struct net_device *dev)
{
@@ -591,10 +592,10 @@ int netdev_boot_setup_check(struct net_device *dev)
for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
!strcmp(dev->name, s[i].name)) {
- dev->irq = s[i].map.irq;
- dev->base_addr = s[i].map.base_addr;
- dev->mem_start = s[i].map.mem_start;
- dev->mem_end = s[i].map.mem_end;
+ dev->irq = s[i].map.irq;
+ dev->base_addr = s[i].map.base_addr;
+ dev->mem_start = s[i].map.mem_start;
+ dev->mem_end = s[i].map.mem_end;
return 1;
}
}
@@ -604,14 +605,14 @@ EXPORT_SYMBOL(netdev_boot_setup_check);
/**
- * netdev_boot_base - get address from boot time settings
- * @prefix: prefix for network device
- * @unit: id for network device
+ * netdev_boot_base - get address from boot time settings
+ * @prefix: prefix for network device
+ * @unit: id for network device
*
- * Check boot time settings for the base address of device.
- * The found settings are set for the device to be used
- * later in the device probing.
- * Returns 0 if no settings found.
+ * Check boot time settings for the base address of device.
+ * The found settings are set for the device to be used
+ * later in the device probing.
+ * Returns 0 if no settings found.
*/
unsigned long netdev_boot_base(const char *prefix, int unit)
{
@@ -664,10 +665,10 @@ int __init netdev_boot_setup(char *str)
__setup("netdev=", netdev_boot_setup);
/*******************************************************************************
-
- Device Interface Subroutines
-
-*******************************************************************************/
+ *
+ * Device Interface Subroutines
+ *
+ *******************************************************************************/
/**
* dev_get_iflink - get 'iflink' value of a interface
@@ -738,15 +739,15 @@ struct net_device *__dev_get_by_name(struct net *net, const char *name)
EXPORT_SYMBOL(__dev_get_by_name);
/**
- * dev_get_by_name_rcu - find a device by its name
- * @net: the applicable net namespace
- * @name: name to find
+ * dev_get_by_name_rcu - find a device by its name
+ * @net: the applicable net namespace
+ * @name: name to find
*
- * Find an interface by name.
- * If the name is found a pointer to the device is returned.
- * If the name is not found then %NULL is returned.
- * The reference counters are not incremented so the caller must be
- * careful with locks. The caller must hold RCU lock.
+ * Find an interface by name.
+ * If the name is found a pointer to the device is returned.
+ * If the name is not found then %NULL is returned.
+ * The reference counters are not incremented so the caller must be
+ * careful with locks. The caller must hold RCU lock.
*/
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
@@ -1290,8 +1291,8 @@ void netdev_state_change(struct net_device *dev)
EXPORT_SYMBOL(netdev_state_change);
/**
- * netdev_notify_peers - notify network peers about existence of @dev
- * @dev: network device
+ * netdev_notify_peers - notify network peers about existence of @dev
+ * @dev: network device
*
* Generate traffic such that interested network peers are aware of
* @dev, such as by generating a gratuitous ARP. This may be used when
@@ -1303,6 +1304,7 @@ void netdev_notify_peers(struct net_device *dev)
{
rtnl_lock();
call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
+ call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
@@ -1519,17 +1521,17 @@ static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
static int dev_boot_phase = 1;
/**
- * register_netdevice_notifier - register a network notifier block
- * @nb: notifier
+ * register_netdevice_notifier - register a network notifier block
+ * @nb: notifier
*
- * Register a notifier to be called when network device events occur.
- * The notifier passed is linked into the kernel structures and must
- * not be reused until it has been unregistered. A negative errno code
- * is returned on a failure.
+ * Register a notifier to be called when network device events occur.
+ * The notifier passed is linked into the kernel structures and must
+ * not be reused until it has been unregistered. A negative errno code
+ * is returned on a failure.
*
- * When registered all registration and up events are replayed
- * to the new notifier to allow device to have a race free
- * view of the network device list.
+ * When registered all registration and up events are replayed
+ * to the new notifier to allow device to have a race free
+ * view of the network device list.
*/
int register_netdevice_notifier(struct notifier_block *nb)
@@ -1586,17 +1588,17 @@ outroll:
EXPORT_SYMBOL(register_netdevice_notifier);
/**
- * unregister_netdevice_notifier - unregister a network notifier block
- * @nb: notifier
+ * unregister_netdevice_notifier - unregister a network notifier block
+ * @nb: notifier
*
- * Unregister a notifier previously registered by
- * register_netdevice_notifier(). The notifier is unlinked into the
- * kernel structures and may then be reused. A negative errno code
- * is returned on a failure.
+ * Unregister a notifier previously registered by
+ * register_netdevice_notifier(). The notifier is unlinked into the
+ * kernel structures and may then be reused. A negative errno code
+ * is returned on a failure.
*
- * After unregistering unregister and down device events are synthesized
- * for all devices on the device list to the removed notifier to remove
- * the need for special case cleanup code.
+ * After unregistering unregister and down device events are synthesized
+ * for all devices on the device list to the removed notifier to remove
+ * the need for special case cleanup code.
*/
int unregister_netdevice_notifier(struct notifier_block *nb)
@@ -1696,50 +1698,72 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue);
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
-/* We are not allowed to call static_key_slow_dec() from irq context
- * If net_disable_timestamp() is called from irq context, defer the
- * static_key_slow_dec() calls.
- */
static atomic_t netstamp_needed_deferred;
+static atomic_t netstamp_wanted;
+static void netstamp_clear(struct work_struct *work)
+{
+ int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
+ int wanted;
+
+ wanted = atomic_add_return(deferred, &netstamp_wanted);
+ if (wanted > 0)
+ static_key_enable(&netstamp_needed);
+ else
+ static_key_disable(&netstamp_needed);
+}
+static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif
void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
- int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
+ int wanted;
- if (deferred) {
- while (--deferred)
- static_key_slow_dec(&netstamp_needed);
- return;
+ while (1) {
+ wanted = atomic_read(&netstamp_wanted);
+ if (wanted <= 0)
+ break;
+ if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
+ return;
}
-#endif
+ atomic_inc(&netstamp_needed_deferred);
+ schedule_work(&netstamp_work);
+#else
static_key_slow_inc(&netstamp_needed);
+#endif
}
EXPORT_SYMBOL(net_enable_timestamp);
void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
- if (in_interrupt()) {
- atomic_inc(&netstamp_needed_deferred);
- return;
+ int wanted;
+
+ while (1) {
+ wanted = atomic_read(&netstamp_wanted);
+ if (wanted <= 1)
+ break;
+ if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
+ return;
}
-#endif
+ atomic_dec(&netstamp_needed_deferred);
+ schedule_work(&netstamp_work);
+#else
static_key_slow_dec(&netstamp_needed);
+#endif
}
EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
- skb->tstamp.tv64 = 0;
+ skb->tstamp = 0;
if (static_key_false(&netstamp_needed))
__net_timestamp(skb);
}
#define net_timestamp_check(COND, SKB) \
if (static_key_false(&netstamp_needed)) { \
- if ((COND) && !(SKB)->tstamp.tv64) \
+ if ((COND) && !(SKB)->tstamp) \
__net_timestamp(SKB); \
} \
@@ -1944,37 +1968,80 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq)
}
}
+int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
+{
+ if (dev->num_tc) {
+ struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+ int i;
+
+ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
+ if ((txq - tc->offset) < tc->count)
+ return i;
+ }
+
+ return -1;
+ }
+
+ return 0;
+}
+
#ifdef CONFIG_XPS
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P) \
rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
-static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
- int cpu, u16 index)
+static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
+ int tci, u16 index)
{
struct xps_map *map = NULL;
int pos;
if (dev_maps)
- map = xmap_dereference(dev_maps->cpu_map[cpu]);
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
+ if (!map)
+ return false;
- for (pos = 0; map && pos < map->len; pos++) {
- if (map->queues[pos] == index) {
- if (map->len > 1) {
- map->queues[pos] = map->queues[--map->len];
- } else {
- RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
- kfree_rcu(map, rcu);
- map = NULL;
- }
+ for (pos = map->len; pos--;) {
+ if (map->queues[pos] != index)
+ continue;
+
+ if (map->len > 1) {
+ map->queues[pos] = map->queues[--map->len];
break;
}
+
+ RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
+ kfree_rcu(map, rcu);
+ return false;
}
- return map;
+ return true;
}
-static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
+static bool remove_xps_queue_cpu(struct net_device *dev,
+ struct xps_dev_maps *dev_maps,
+ int cpu, u16 offset, u16 count)
+{
+ int num_tc = dev->num_tc ? : 1;
+ bool active = false;
+ int tci;
+
+ for (tci = cpu * num_tc; num_tc--; tci++) {
+ int i, j;
+
+ for (i = count, j = offset; i--; j++) {
+ if (!remove_xps_queue(dev_maps, cpu, j))
+ break;
+ }
+
+ active |= i < 0;
+ }
+
+ return active;
+}
+
+static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
+ u16 count)
{
struct xps_dev_maps *dev_maps;
int cpu, i;
@@ -1986,21 +2053,16 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
if (!dev_maps)
goto out_no_maps;
- for_each_possible_cpu(cpu) {
- for (i = index; i < dev->num_tx_queues; i++) {
- if (!remove_xps_queue(dev_maps, cpu, i))
- break;
- }
- if (i == dev->num_tx_queues)
- active = true;
- }
+ for_each_possible_cpu(cpu)
+ active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
+ offset, count);
if (!active) {
RCU_INIT_POINTER(dev->xps_maps, NULL);
kfree_rcu(dev_maps, rcu);
}
- for (i = index; i < dev->num_tx_queues; i++)
+ for (i = offset + (count - 1); count--; i--)
netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
NUMA_NO_NODE);
@@ -2008,6 +2070,11 @@ out_no_maps:
mutex_unlock(&xps_map_mutex);
}
+static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
+{
+ netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
+}
+
static struct xps_map *expand_xps_map(struct xps_map *map,
int cpu, u16 index)
{
@@ -2047,20 +2114,28 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
u16 index)
{
struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
+ int i, cpu, tci, numa_node_id = -2;
+ int maps_sz, num_tc = 1, tc = 0;
struct xps_map *map, *new_map;
- int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
- int cpu, numa_node_id = -2;
bool active = false;
+ if (dev->num_tc) {
+ num_tc = dev->num_tc;
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0)
+ return -EINVAL;
+ }
+
+ maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
+ if (maps_sz < L1_CACHE_BYTES)
+ maps_sz = L1_CACHE_BYTES;
+
mutex_lock(&xps_map_mutex);
dev_maps = xmap_dereference(dev->xps_maps);
/* allocate memory for queue storage */
- for_each_online_cpu(cpu) {
- if (!cpumask_test_cpu(cpu, mask))
- continue;
-
+ for_each_cpu_and(cpu, cpu_online_mask, mask) {
if (!new_dev_maps)
new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
if (!new_dev_maps) {
@@ -2068,25 +2143,38 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
return -ENOMEM;
}
- map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
+ tci = cpu * num_tc + tc;
+ map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
NULL;
map = expand_xps_map(map, cpu, index);
if (!map)
goto error;
- RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
}
if (!new_dev_maps)
goto out_no_new_maps;
for_each_possible_cpu(cpu) {
+ /* copy maps belonging to foreign traffic classes */
+ for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ }
+
+ /* We need to explicitly update tci as prevous loop
+ * could break out early if dev_maps is NULL.
+ */
+ tci = cpu * num_tc + tc;
+
if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
/* add queue to CPU maps */
int pos = 0;
- map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+ map = xmap_dereference(new_dev_maps->cpu_map[tci]);
while ((pos < map->len) && (map->queues[pos] != index))
pos++;
@@ -2100,26 +2188,36 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
#endif
} else if (dev_maps) {
/* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->cpu_map[cpu]);
- RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
}
+ /* copy maps belonging to foreign traffic classes */
+ for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ }
}
rcu_assign_pointer(dev->xps_maps, new_dev_maps);
/* Cleanup old maps */
- if (dev_maps) {
- for_each_possible_cpu(cpu) {
- new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
- map = xmap_dereference(dev_maps->cpu_map[cpu]);
+ if (!dev_maps)
+ goto out_no_old_maps;
+
+ for_each_possible_cpu(cpu) {
+ for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
+ new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
if (map && map != new_map)
kfree_rcu(map, rcu);
}
-
- kfree_rcu(dev_maps, rcu);
}
+ kfree_rcu(dev_maps, rcu);
+
+out_no_old_maps:
dev_maps = new_dev_maps;
active = true;
@@ -2134,11 +2232,12 @@ out_no_new_maps:
/* removes queue from unused CPUs */
for_each_possible_cpu(cpu) {
- if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
- continue;
-
- if (remove_xps_queue(dev_maps, cpu, index))
- active = true;
+ for (i = tc, tci = cpu * num_tc; i--; tci++)
+ active |= remove_xps_queue(dev_maps, tci, index);
+ if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
+ active |= remove_xps_queue(dev_maps, tci, index);
+ for (i = num_tc - tc, tci++; --i; tci++)
+ active |= remove_xps_queue(dev_maps, tci, index);
}
/* free map if not active */
@@ -2154,11 +2253,14 @@ out_no_maps:
error:
/* remove any maps that we added */
for_each_possible_cpu(cpu) {
- new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
- map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
- NULL;
- if (new_map && new_map != map)
- kfree(new_map);
+ for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
+ new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+ map = dev_maps ?
+ xmap_dereference(dev_maps->cpu_map[tci]) :
+ NULL;
+ if (new_map && new_map != map)
+ kfree(new_map);
+ }
}
mutex_unlock(&xps_map_mutex);
@@ -2169,6 +2271,44 @@ error:
EXPORT_SYMBOL(netif_set_xps_queue);
#endif
+void netdev_reset_tc(struct net_device *dev)
+{
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ dev->num_tc = 0;
+ memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
+ memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
+}
+EXPORT_SYMBOL(netdev_reset_tc);
+
+int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
+{
+ if (tc >= dev->num_tc)
+ return -EINVAL;
+
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues(dev, offset, count);
+#endif
+ dev->tc_to_txq[tc].count = count;
+ dev->tc_to_txq[tc].offset = offset;
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_tc_queue);
+
+int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
+{
+ if (num_tc > TC_MAX_QUEUE)
+ return -EINVAL;
+
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ dev->num_tc = num_tc;
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_num_tc);
+
/*
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
* greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
@@ -2293,28 +2433,6 @@ void netif_schedule_queue(struct netdev_queue *txq)
}
EXPORT_SYMBOL(netif_schedule_queue);
-/**
- * netif_wake_subqueue - allow sending packets on subqueue
- * @dev: network device
- * @queue_index: sub queue index
- *
- * Resume individual transmit queue of a device with multiple transmit queues.
- */
-void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
-{
- struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
-
- if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
- struct Qdisc *q;
-
- rcu_read_lock();
- q = rcu_dereference(txq->qdisc);
- __netif_schedule(q);
- rcu_read_unlock();
- }
-}
-EXPORT_SYMBOL(netif_wake_subqueue);
-
void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
@@ -2408,6 +2526,7 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
if (dev->num_tc) {
u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+
qoffset = dev->tc_to_txq[tc].offset;
qcount = dev->tc_to_txq[tc].count;
}
@@ -2487,141 +2606,6 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
-/* skb_csum_offload_check - Driver helper function to determine if a device
- * with limited checksum offload capabilities is able to offload the checksum
- * for a given packet.
- *
- * Arguments:
- * skb - sk_buff for the packet in question
- * spec - contains the description of what device can offload
- * csum_encapped - returns true if the checksum being offloaded is
- * encpasulated. That is it is checksum for the transport header
- * in the inner headers.
- * checksum_help - when set indicates that helper function should
- * call skb_checksum_help if offload checks fail
- *
- * Returns:
- * true: Packet has passed the checksum checks and should be offloadable to
- * the device (a driver may still need to check for additional
- * restrictions of its device)
- * false: Checksum is not offloadable. If checksum_help was set then
- * skb_checksum_help was called to resolve checksum for non-GSO
- * packets and when IP protocol is not SCTP
- */
-bool __skb_csum_offload_chk(struct sk_buff *skb,
- const struct skb_csum_offl_spec *spec,
- bool *csum_encapped,
- bool csum_help)
-{
- struct iphdr *iph;
- struct ipv6hdr *ipv6;
- void *nhdr;
- int protocol;
- u8 ip_proto;
-
- if (skb->protocol == htons(ETH_P_8021Q) ||
- skb->protocol == htons(ETH_P_8021AD)) {
- if (!spec->vlan_okay)
- goto need_help;
- }
-
- /* We check whether the checksum refers to a transport layer checksum in
- * the outermost header or an encapsulated transport layer checksum that
- * corresponds to the inner headers of the skb. If the checksum is for
- * something else in the packet we need help.
- */
- if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
- /* Non-encapsulated checksum */
- protocol = eproto_to_ipproto(vlan_get_protocol(skb));
- nhdr = skb_network_header(skb);
- *csum_encapped = false;
- if (spec->no_not_encapped)
- goto need_help;
- } else if (skb->encapsulation && spec->encap_okay &&
- skb_checksum_start_offset(skb) ==
- skb_inner_transport_offset(skb)) {
- /* Encapsulated checksum */
- *csum_encapped = true;
- switch (skb->inner_protocol_type) {
- case ENCAP_TYPE_ETHER:
- protocol = eproto_to_ipproto(skb->inner_protocol);
- break;
- case ENCAP_TYPE_IPPROTO:
- protocol = skb->inner_protocol;
- break;
- }
- nhdr = skb_inner_network_header(skb);
- } else {
- goto need_help;
- }
-
- switch (protocol) {
- case IPPROTO_IP:
- if (!spec->ipv4_okay)
- goto need_help;
- iph = nhdr;
- ip_proto = iph->protocol;
- if (iph->ihl != 5 && !spec->ip_options_okay)
- goto need_help;
- break;
- case IPPROTO_IPV6:
- if (!spec->ipv6_okay)
- goto need_help;
- if (spec->no_encapped_ipv6 && *csum_encapped)
- goto need_help;
- ipv6 = nhdr;
- nhdr += sizeof(*ipv6);
- ip_proto = ipv6->nexthdr;
- break;
- default:
- goto need_help;
- }
-
-ip_proto_again:
- switch (ip_proto) {
- case IPPROTO_TCP:
- if (!spec->tcp_okay ||
- skb->csum_offset != offsetof(struct tcphdr, check))
- goto need_help;
- break;
- case IPPROTO_UDP:
- if (!spec->udp_okay ||
- skb->csum_offset != offsetof(struct udphdr, check))
- goto need_help;
- break;
- case IPPROTO_SCTP:
- if (!spec->sctp_okay ||
- skb->csum_offset != offsetof(struct sctphdr, checksum))
- goto cant_help;
- break;
- case NEXTHDR_HOP:
- case NEXTHDR_ROUTING:
- case NEXTHDR_DEST: {
- u8 *opthdr = nhdr;
-
- if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
- goto need_help;
-
- ip_proto = opthdr[0];
- nhdr += (opthdr[1] + 1) << 3;
-
- goto ip_proto_again;
- }
- default:
- goto need_help;
- }
-
- /* Passed the tests for offloading checksum */
- return true;
-
-need_help:
- if (csum_help && !skb_shinfo(skb)->gso_size)
- skb_checksum_help(skb);
-cant_help:
- return false;
-}
-EXPORT_SYMBOL(__skb_csum_offload_chk);
-
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
__be16 type = skb->protocol;
@@ -2679,9 +2663,10 @@ EXPORT_SYMBOL(skb_mac_gso_segment);
static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
{
if (tx_path)
- return skb->ip_summed != CHECKSUM_PARTIAL;
- else
- return skb->ip_summed == CHECKSUM_NONE;
+ return skb->ip_summed != CHECKSUM_PARTIAL &&
+ skb->ip_summed != CHECKSUM_NONE;
+
+ return skb->ip_summed == CHECKSUM_NONE;
}
/**
@@ -2700,11 +2685,12 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
netdev_features_t features, bool tx_path)
{
+ struct sk_buff *segs;
+
if (unlikely(skb_needs_check(skb, tx_path))) {
int err;
- skb_warn_bad_offload(skb);
-
+ /* We're going to init ->check field in TCP or UDP header */
err = skb_cow_head(skb, 0);
if (err < 0)
return ERR_PTR(err);
@@ -2732,7 +2718,12 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
skb_reset_mac_header(skb);
skb_reset_mac_len(skb);
- return skb_mac_gso_segment(skb, features);
+ segs = skb_mac_gso_segment(skb, features);
+
+ if (unlikely(skb_needs_check(skb, tx_path)))
+ skb_warn_bad_offload(skb);
+
+ return segs;
}
EXPORT_SYMBOL(__skb_gso_segment);
@@ -2757,9 +2748,11 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
int i;
+
if (!(dev->features & NETIF_F_HIGHDMA)) {
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
if (PageHighMem(skb_frag_page(frag)))
return 1;
}
@@ -2773,6 +2766,7 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
dma_addr_t addr = page_to_phys(skb_frag_page(frag));
+
if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
return 1;
}
@@ -2815,9 +2809,9 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
if (skb->ip_summed != CHECKSUM_NONE &&
!can_checksum_protocol(features, type)) {
features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
- } else if (illegal_highdma(skb->dev, skb)) {
- features &= ~NETIF_F_SG;
}
+ if (illegal_highdma(skb->dev, skb))
+ features &= ~NETIF_F_SG;
return features;
}
@@ -3173,9 +3167,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
if (!cl)
return skb;
- /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
- * earlier by the caller.
- */
+ /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
qdisc_bstats_cpu_update(cl->q, skb);
switch (tc_classify(skb, cl, &cl_res, false)) {
@@ -3216,8 +3208,14 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_maps);
if (dev_maps) {
- map = rcu_dereference(
- dev_maps->cpu_map[skb->sender_cpu - 1]);
+ unsigned int tci = skb->sender_cpu - 1;
+
+ if (dev->num_tc) {
+ tci *= dev->num_tc;
+ tci += netdev_get_prio_tc_map(dev, skb->priority);
+ }
+
+ map = rcu_dereference(dev_maps->cpu_map[tci]);
if (map) {
if (map->len == 1)
queue_index = map->queues[0];
@@ -3244,6 +3242,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
if (queue_index < 0 || skb->ooo_okay ||
queue_index >= dev->real_num_tx_queues) {
int new_index = get_xps_queue(dev, skb);
+
if (new_index < 0)
new_index = skb_tx_hash(dev, skb);
@@ -3273,6 +3272,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
if (dev->real_num_tx_queues != 1) {
const struct net_device_ops *ops = dev->netdev_ops;
+
if (ops->ndo_select_queue)
queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
__netdev_pick_tx);
@@ -3334,7 +3334,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
- skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
+ skb->tc_at_ingress = 0;
# ifdef CONFIG_NET_EGRESS
if (static_key_false(&egress_needed)) {
skb = sch_handle_egress(skb, &rc, dev);
@@ -3361,16 +3361,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
}
/* The device has no queue. Common case for software devices:
- loopback, all the sorts of tunnels...
+ * loopback, all the sorts of tunnels...
- Really, it is unlikely that netif_tx_lock protection is necessary
- here. (f.e. loopback and IP tunnels are clean ignoring statistics
- counters.)
- However, it is possible, that they rely on protection
- made by us here.
+ * Really, it is unlikely that netif_tx_lock protection is necessary
+ * here. (f.e. loopback and IP tunnels are clean ignoring statistics
+ * counters.)
+ * However, it is possible, that they rely on protection
+ * made by us here.
- Check this and shot the lock. It is not prone from deadlocks.
- Either shot noqueue qdisc, it is even simpler 8)
+ * Check this and shot the lock. It is not prone from deadlocks.
+ *Either shot noqueue qdisc, it is even simpler 8)
*/
if (dev->flags & IFF_UP) {
int cpu = smp_processor_id(); /* ok because BHs are off */
@@ -3432,16 +3432,20 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
EXPORT_SYMBOL(dev_queue_xmit_accel);
-/*=======================================================================
- Receiver routines
- =======================================================================*/
+/*************************************************************************
+ * Receiver routines
+ *************************************************************************/
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
-int weight_p __read_mostly = 64; /* old backlog weight */
+int weight_p __read_mostly = 64; /* old backlog weight */
+int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
+int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
+int dev_rx_weight __read_mostly = 64;
+int dev_tx_weight __read_mostly = 64;
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
@@ -3461,6 +3465,8 @@ EXPORT_SYMBOL(rps_cpu_mask);
struct static_key rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
+struct static_key rfs_needed __read_mostly;
+EXPORT_SYMBOL(rfs_needed);
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
@@ -3796,6 +3802,7 @@ static int netif_rx_internal(struct sk_buff *skb)
#endif
{
unsigned int qtail;
+
ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
put_cpu();
}
@@ -3855,6 +3862,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
while (clist) {
struct sk_buff *skb = clist;
+
clist = clist->next;
WARN_ON(atomic_read(&skb->users));
@@ -3928,7 +3936,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
}
qdisc_skb_cb(skb)->pkt_len = skb->len;
- skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
+ skb->tc_at_ingress = 1;
qdisc_bstats_cpu_update(cl->q, skb);
switch (tc_classify(skb, cl, &cl_res, false)) {
@@ -3993,9 +4001,7 @@ int netdev_rx_handler_register(struct net_device *dev,
rx_handler_func_t *rx_handler,
void *rx_handler_data)
{
- ASSERT_RTNL();
-
- if (dev->rx_handler)
+ if (netdev_is_rx_handler_busy(dev))
return -EBUSY;
/* Note: rx_handler_data must be set before rx_handler */
@@ -4101,12 +4107,8 @@ another_round:
goto out;
}
-#ifdef CONFIG_NET_CLS_ACT
- if (skb->tc_verd & TC_NCLS) {
- skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
- goto ncls;
- }
-#endif
+ if (skb_skip_tc_classify(skb))
+ goto skip_classify;
if (pfmemalloc)
goto skip_taps;
@@ -4134,10 +4136,8 @@ skip_taps:
goto out;
}
#endif
-#ifdef CONFIG_NET_CLS_ACT
- skb->tc_verd = 0;
-ncls:
-#endif
+ skb_reset_tc(skb);
+skip_classify:
if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
goto drop;
@@ -4453,7 +4453,9 @@ static void skb_gro_reset_offset(struct sk_buff *skb)
pinfo->nr_frags &&
!PageHighMem(skb_frag_page(frag0))) {
NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
- NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
+ NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
+ skb_frag_size(frag0),
+ skb->end - skb->tail);
}
}
@@ -4491,7 +4493,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (!(skb->dev->features & NETIF_F_GRO))
goto normal;
- if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
+ if (skb->csum_bad)
goto normal;
gro_list_prepare(napi, skb);
@@ -4504,7 +4506,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
skb_set_network_header(skb, skb_gro_offset(skb));
skb_reset_mac_len(skb);
NAPI_GRO_CB(skb)->same_flow = 0;
- NAPI_GRO_CB(skb)->flush = 0;
+ NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
NAPI_GRO_CB(skb)->free = 0;
NAPI_GRO_CB(skb)->encap_mark = 0;
NAPI_GRO_CB(skb)->recursion_counter = 0;
@@ -4536,6 +4538,11 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (&ptype->list == head)
goto normal;
+ if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
+ ret = GRO_CONSUMED;
+ goto ok;
+ }
+
same_flow = NAPI_GRO_CB(skb)->same_flow;
ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
@@ -4631,6 +4638,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
case GRO_MERGED_FREE:
if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
skb_dst_drop(skb);
+ secpath_reset(skb);
kmem_cache_free(skbuff_head_cache, skb);
} else {
__kfree_skb(skb);
@@ -4639,6 +4647,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
case GRO_HELD:
case GRO_MERGED:
+ case GRO_CONSUMED:
break;
}
@@ -4671,6 +4680,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
skb->encapsulation = 0;
skb_shinfo(skb)->gso_type = 0;
skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+ secpath_reset(skb);
napi->skb = skb;
}
@@ -4709,6 +4719,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
break;
case GRO_MERGED:
+ case GRO_CONSUMED:
break;
}
@@ -4845,7 +4856,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
net_rps_action_and_irq_enable(sd);
}
- napi->weight = weight_p;
+ napi->weight = dev_rx_weight;
while (again) {
struct sk_buff *skb;
@@ -4901,6 +4912,39 @@ void __napi_schedule(struct napi_struct *n)
EXPORT_SYMBOL(__napi_schedule);
/**
+ * napi_schedule_prep - check if napi can be scheduled
+ * @n: napi context
+ *
+ * Test if NAPI routine is already running, and if not mark
+ * it as running. This is used as a condition variable
+ * insure only one NAPI poll instance runs. We also make
+ * sure there is no pending NAPI disable.
+ */
+bool napi_schedule_prep(struct napi_struct *n)
+{
+ unsigned long val, new;
+
+ do {
+ val = READ_ONCE(n->state);
+ if (unlikely(val & NAPIF_STATE_DISABLE))
+ return false;
+ new = val | NAPIF_STATE_SCHED;
+
+ /* Sets STATE_MISSED bit if STATE_SCHED was already set
+ * This was suggested by Alexander Duyck, as compiler
+ * emits better code than :
+ * if (val & NAPIF_STATE_SCHED)
+ * new |= NAPIF_STATE_MISSED;
+ */
+ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
+ NAPIF_STATE_MISSED;
+ } while (cmpxchg(&n->state, val, new) != val);
+
+ return !(val & NAPIF_STATE_SCHED);
+}
+EXPORT_SYMBOL(napi_schedule_prep);
+
+/**
* __napi_schedule_irqoff - schedule for receive
* @n: entry to schedule
*
@@ -4912,26 +4956,19 @@ void __napi_schedule_irqoff(struct napi_struct *n)
}
EXPORT_SYMBOL(__napi_schedule_irqoff);
-void __napi_complete(struct napi_struct *n)
-{
- BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
-
- list_del_init(&n->poll_list);
- smp_mb__before_atomic();
- clear_bit(NAPI_STATE_SCHED, &n->state);
-}
-EXPORT_SYMBOL(__napi_complete);
-
-void napi_complete_done(struct napi_struct *n, int work_done)
+bool napi_complete_done(struct napi_struct *n, int work_done)
{
- unsigned long flags;
+ unsigned long flags, val, new;
/*
- * don't let napi dequeue from the cpu poll list
- * just in case its running on a different cpu
+ * 1) Don't let napi dequeue from the cpu poll list
+ * just in case its running on a different cpu.
+ * 2) If we are busy polling, do nothing here, we have
+ * the guarantee we will be called later.
*/
- if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
- return;
+ if (unlikely(n->state & (NAPIF_STATE_NPSVC |
+ NAPIF_STATE_IN_BUSY_POLL)))
+ return false;
if (n->gro_list) {
unsigned long timeout = 0;
@@ -4945,14 +4982,34 @@ void napi_complete_done(struct napi_struct *n, int work_done)
else
napi_gro_flush(n, false);
}
- if (likely(list_empty(&n->poll_list))) {
- WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
- } else {
+ if (unlikely(!list_empty(&n->poll_list))) {
/* If n->poll_list is not empty, we need to mask irqs */
local_irq_save(flags);
- __napi_complete(n);
+ list_del_init(&n->poll_list);
local_irq_restore(flags);
}
+
+ do {
+ val = READ_ONCE(n->state);
+
+ WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
+
+ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
+
+ /* If STATE_MISSED was set, leave STATE_SCHED set,
+ * because we will call napi->poll() one more time.
+ * This C code was suggested by Alexander Duyck to help gcc.
+ */
+ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
+ NAPIF_STATE_SCHED;
+ } while (cmpxchg(&n->state, val, new) != val);
+
+ if (unlikely(val & NAPIF_STATE_MISSED)) {
+ __napi_schedule(n);
+ return false;
+ }
+
+ return true;
}
EXPORT_SYMBOL(napi_complete_done);
@@ -4970,13 +5027,50 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
}
#if defined(CONFIG_NET_RX_BUSY_POLL)
+
#define BUSY_POLL_BUDGET 8
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+{
+ int rc;
+
+ /* Busy polling means there is a high chance device driver hard irq
+ * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
+ * set in napi_schedule_prep().
+ * Since we are about to call napi->poll() once more, we can safely
+ * clear NAPI_STATE_MISSED.
+ *
+ * Note: x86 could use a single "lock and ..." instruction
+ * to perform these two clear_bit()
+ */
+ clear_bit(NAPI_STATE_MISSED, &napi->state);
+ clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
+
+ local_bh_disable();
+
+ /* All we really want here is to re-enable device interrupts.
+ * Ideally, a new ndo_busy_poll_stop() could avoid another round.
+ */
+ rc = napi->poll(napi, BUSY_POLL_BUDGET);
+ netpoll_poll_unlock(have_poll_lock);
+ if (rc == BUSY_POLL_BUDGET)
+ __napi_schedule(napi);
+ local_bh_enable();
+ if (local_softirq_pending())
+ do_softirq();
+}
+
bool sk_busy_loop(struct sock *sk, int nonblock)
{
unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
- int (*busy_poll)(struct napi_struct *dev);
+ int (*napi_poll)(struct napi_struct *napi, int budget);
+ void *have_poll_lock = NULL;
struct napi_struct *napi;
- int rc = false;
+ int rc;
+
+restart:
+ rc = false;
+ napi_poll = NULL;
rcu_read_lock();
@@ -4984,39 +5078,54 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
if (!napi)
goto out;
- /* Note: ndo_busy_poll method is optional in linux-4.5 */
- busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
-
- do {
+ preempt_disable();
+ for (;;) {
rc = 0;
local_bh_disable();
- if (busy_poll) {
- rc = busy_poll(napi);
- } else if (napi_schedule_prep(napi)) {
- void *have = netpoll_poll_lock(napi);
-
- if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
- rc = napi->poll(napi, BUSY_POLL_BUDGET);
- trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
- if (rc == BUSY_POLL_BUDGET) {
- napi_complete_done(napi, rc);
- napi_schedule(napi);
- }
- }
- netpoll_poll_unlock(have);
+ if (!napi_poll) {
+ unsigned long val = READ_ONCE(napi->state);
+
+ /* If multiple threads are competing for this napi,
+ * we avoid dirtying napi->state as much as we can.
+ */
+ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
+ NAPIF_STATE_IN_BUSY_POLL))
+ goto count;
+ if (cmpxchg(&napi->state, val,
+ val | NAPIF_STATE_IN_BUSY_POLL |
+ NAPIF_STATE_SCHED) != val)
+ goto count;
+ have_poll_lock = netpoll_poll_lock(napi);
+ napi_poll = napi->poll;
}
+ rc = napi_poll(napi, BUSY_POLL_BUDGET);
+ trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+count:
if (rc > 0)
__NET_ADD_STATS(sock_net(sk),
LINUX_MIB_BUSYPOLLRXPACKETS, rc);
local_bh_enable();
- if (rc == LL_FLUSH_FAILED)
- break; /* permanent failure */
+ if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
+ busy_loop_timeout(end_time))
+ break;
+ if (unlikely(need_resched())) {
+ if (napi_poll)
+ busy_poll_stop(napi, have_poll_lock);
+ preempt_enable();
+ rcu_read_unlock();
+ cond_resched();
+ rc = !skb_queue_empty(&sk->sk_receive_queue);
+ if (rc || busy_loop_timeout(end_time))
+ return rc;
+ goto restart;
+ }
cpu_relax();
- } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
- !need_resched() && !busy_loop_timeout(end_time));
-
+ }
+ if (napi_poll)
+ busy_poll_stop(napi, have_poll_lock);
+ preempt_enable();
rc = !skb_queue_empty(&sk->sk_receive_queue);
out:
rcu_read_unlock();
@@ -5026,7 +5135,7 @@ EXPORT_SYMBOL(sk_busy_loop);
#endif /* CONFIG_NET_RX_BUSY_POLL */
-void napi_hash_add(struct napi_struct *napi)
+static void napi_hash_add(struct napi_struct *napi)
{
if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
@@ -5046,7 +5155,6 @@ void napi_hash_add(struct napi_struct *napi)
spin_unlock(&napi_hash_lock);
}
-EXPORT_SYMBOL_GPL(napi_hash_add);
/* Warning : caller is responsible to make sure rcu grace period
* is respected before freeing memory containing @napi
@@ -5071,8 +5179,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
struct napi_struct *napi;
napi = container_of(timer, struct napi_struct, timer);
- if (napi->gro_list)
- napi_schedule(napi);
+
+ /* Note : we use a relaxed variant of napi_schedule_prep() not setting
+ * NAPI_STATE_MISSED, since we do not react to a device IRQ.
+ */
+ if (napi->gro_list && !napi_disable_pending(napi) &&
+ !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+ __napi_schedule_irqoff(napi);
return HRTIMER_NORESTART;
}
@@ -5094,7 +5207,6 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
list_add(&napi->dev_list, &dev->napi_list);
napi->dev = dev;
#ifdef CONFIG_NETPOLL
- spin_lock_init(&napi->poll_lock);
napi->poll_owner = -1;
#endif
set_bit(NAPI_STATE_SCHED, &napi->state);
@@ -5212,7 +5324,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
if (list_empty(&list)) {
if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
- return;
+ goto out;
break;
}
@@ -5230,7 +5342,6 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
}
}
- __kfree_skb_flush();
local_irq_disable();
list_splice_tail_init(&sd->poll_list, &list);
@@ -5240,6 +5351,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
+out:
+ __kfree_skb_flush();
}
struct netdev_adjacent {
@@ -5270,6 +5383,13 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
return NULL;
}
+static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
+{
+ struct net_device *dev = data;
+
+ return upper_dev == dev;
+}
+
/**
* netdev_has_upper_dev - Check if device is linked to an upper device
* @dev: device
@@ -5284,11 +5404,30 @@ bool netdev_has_upper_dev(struct net_device *dev,
{
ASSERT_RTNL();
- return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
+ return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
+ upper_dev);
}
EXPORT_SYMBOL(netdev_has_upper_dev);
/**
+ * netdev_has_upper_dev_all - Check if device is linked to an upper device
+ * @dev: device
+ * @upper_dev: upper device to check
+ *
+ * Find out if a device is linked to specified upper device and return true
+ * in case it is. Note that this checks the entire upper device chain.
+ * The caller must hold rcu lock.
+ */
+
+bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
+ upper_dev);
+}
+EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
+
+/**
* netdev_has_any_upper_dev - Check if device is linked to some device
* @dev: device
*
@@ -5299,7 +5438,7 @@ static bool netdev_has_any_upper_dev(struct net_device *dev)
{
ASSERT_RTNL();
- return !list_empty(&dev->all_adj_list.upper);
+ return !list_empty(&dev->adj_list.upper);
}
/**
@@ -5326,6 +5465,20 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);
+/**
+ * netdev_has_any_lower_dev - Check if device is linked to some device
+ * @dev: device
+ *
+ * Find out if a device is linked to a lower device and return true in case
+ * it is. The caller must hold the RTNL lock.
+ */
+static bool netdev_has_any_lower_dev(struct net_device *dev)
+{
+ ASSERT_RTNL();
+
+ return !list_empty(&dev->adj_list.lower);
+}
+
void *netdev_adjacent_get_private(struct list_head *adj_list)
{
struct netdev_adjacent *adj;
@@ -5362,16 +5515,8 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
-/**
- * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
- * @dev: device
- * @iter: list_head ** of the current position
- *
- * Gets the next device from the dev's upper list, starting from iter
- * position. The caller must hold RCU read lock.
- */
-struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
- struct list_head **iter)
+static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
{
struct netdev_adjacent *upper;
@@ -5379,14 +5524,41 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
- if (&upper->list == &dev->all_adj_list.upper)
+ if (&upper->list == &dev->adj_list.upper)
return NULL;
*iter = &upper->list;
return upper->dev;
}
-EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
+
+int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *udev;
+ struct list_head *iter;
+ int ret;
+
+ for (iter = &dev->adj_list.upper,
+ udev = netdev_next_upper_dev_rcu(dev, &iter);
+ udev;
+ udev = netdev_next_upper_dev_rcu(dev, &iter)) {
+ /* first is the upper device itself */
+ ret = fn(udev, data);
+ if (ret)
+ return ret;
+
+ /* then look at all of its upper devices */
+ ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
/**
* netdev_lower_get_next_private - Get the next ->private from the
@@ -5469,55 +5641,90 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
}
EXPORT_SYMBOL(netdev_lower_get_next);
-/**
- * netdev_all_lower_get_next - Get the next device from all lower neighbour list
- * @dev: device
- * @iter: list_head ** of the current position
- *
- * Gets the next netdev_adjacent from the dev's all lower neighbour
- * list, starting from iter position. The caller must hold RTNL lock or
- * its own locking that guarantees that the neighbour all lower
- * list will remain unchanged.
- */
-struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter)
+static struct net_device *netdev_next_lower_dev(struct net_device *dev,
+ struct list_head **iter)
{
struct netdev_adjacent *lower;
- lower = list_entry(*iter, struct netdev_adjacent, list);
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
- if (&lower->list == &dev->all_adj_list.lower)
+ if (&lower->list == &dev->adj_list.lower)
return NULL;
- *iter = lower->list.next;
+ *iter = &lower->list;
return lower->dev;
}
-EXPORT_SYMBOL(netdev_all_lower_get_next);
-/**
- * netdev_all_lower_get_next_rcu - Get the next device from all
- * lower neighbour list, RCU variant
- * @dev: device
- * @iter: list_head ** of the current position
- *
- * Gets the next netdev_adjacent from the dev's all lower neighbour
- * list, starting from iter position. The caller must hold RCU read lock.
- */
-struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
- struct list_head **iter)
+int netdev_walk_all_lower_dev(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *ldev;
+ struct list_head *iter;
+ int ret;
+
+ for (iter = &dev->adj_list.lower,
+ ldev = netdev_next_lower_dev(dev, &iter);
+ ldev;
+ ldev = netdev_next_lower_dev(dev, &iter)) {
+ /* first is the lower device itself */
+ ret = fn(ldev, data);
+ if (ret)
+ return ret;
+
+ /* then look at all of its lower devices */
+ ret = netdev_walk_all_lower_dev(ldev, fn, data);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
+
+static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
{
struct netdev_adjacent *lower;
lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
-
- if (&lower->list == &dev->all_adj_list.lower)
+ if (&lower->list == &dev->adj_list.lower)
return NULL;
*iter = &lower->list;
return lower->dev;
}
-EXPORT_SYMBOL(netdev_all_lower_get_next_rcu);
+
+int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *ldev;
+ struct list_head *iter;
+ int ret;
+
+ for (iter = &dev->adj_list.lower,
+ ldev = netdev_next_lower_dev_rcu(dev, &iter);
+ ldev;
+ ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
+ /* first is the lower device itself */
+ ret = fn(ldev, data);
+ if (ret)
+ return ret;
+
+ /* then look at all of its lower devices */
+ ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
/**
* netdev_lower_get_first_private_rcu - Get the first ->private from the
@@ -5564,6 +5771,7 @@ static int netdev_adjacent_sysfs_add(struct net_device *dev,
struct list_head *dev_list)
{
char linkname[IFNAMSIZ+7];
+
sprintf(linkname, dev_list == &dev->adj_list.upper ?
"upper_%s" : "lower_%s", adj_dev->name);
return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
@@ -5574,6 +5782,7 @@ static void netdev_adjacent_sysfs_del(struct net_device *dev,
struct list_head *dev_list)
{
char linkname[IFNAMSIZ+7];
+
sprintf(linkname, dev_list == &dev->adj_list.upper ?
"upper_%s" : "lower_%s", name);
sysfs_remove_link(&(dev->dev.kobj), linkname);
@@ -5590,7 +5799,6 @@ static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
static int __netdev_adjacent_dev_insert(struct net_device *dev,
struct net_device *adj_dev,
- u16 ref_nr,
struct list_head *dev_list,
void *private, bool master)
{
@@ -5600,7 +5808,10 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
adj = __netdev_find_adj(adj_dev, dev_list);
if (adj) {
- adj->ref_nr += ref_nr;
+ adj->ref_nr += 1;
+ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
+ dev->name, adj_dev->name, adj->ref_nr);
+
return 0;
}
@@ -5610,12 +5821,12 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
adj->dev = adj_dev;
adj->master = master;
- adj->ref_nr = ref_nr;
+ adj->ref_nr = 1;
adj->private = private;
dev_hold(adj_dev);
- pr_debug("dev_hold for %s, because of link added from %s to %s\n",
- adj_dev->name, dev->name, adj_dev->name);
+ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
+ dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
@@ -5654,17 +5865,22 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
{
struct netdev_adjacent *adj;
+ pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
+ dev->name, adj_dev->name, ref_nr);
+
adj = __netdev_find_adj(adj_dev, dev_list);
if (!adj) {
- pr_err("tried to remove device %s from %s\n",
+ pr_err("Adjacency does not exist for device %s from %s\n",
dev->name, adj_dev->name);
- BUG();
+ WARN_ON(1);
+ return;
}
if (adj->ref_nr > ref_nr) {
- pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
- ref_nr, adj->ref_nr-ref_nr);
+ pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
+ dev->name, adj_dev->name, ref_nr,
+ adj->ref_nr - ref_nr);
adj->ref_nr -= ref_nr;
return;
}
@@ -5676,7 +5892,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
list_del_rcu(&adj->list);
- pr_debug("dev_put for %s, because link removed from %s to %s\n",
+ pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
adj_dev->name, dev->name, adj_dev->name);
dev_put(adj_dev);
kfree_rcu(adj, rcu);
@@ -5684,38 +5900,27 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
struct net_device *upper_dev,
- u16 ref_nr,
struct list_head *up_list,
struct list_head *down_list,
void *private, bool master)
{
int ret;
- ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
+ ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
private, master);
if (ret)
return ret;
- ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
+ ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
private, false);
if (ret) {
- __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
+ __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
return ret;
}
return 0;
}
-static int __netdev_adjacent_dev_link(struct net_device *dev,
- struct net_device *upper_dev,
- u16 ref_nr)
-{
- return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
- &dev->all_adj_list.upper,
- &upper_dev->all_adj_list.lower,
- NULL, false);
-}
-
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
struct net_device *upper_dev,
u16 ref_nr,
@@ -5726,40 +5931,19 @@ static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
}
-static void __netdev_adjacent_dev_unlink(struct net_device *dev,
- struct net_device *upper_dev,
- u16 ref_nr)
-{
- __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
- &dev->all_adj_list.upper,
- &upper_dev->all_adj_list.lower);
-}
-
static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
struct net_device *upper_dev,
void *private, bool master)
{
- int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
-
- if (ret)
- return ret;
-
- ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
- &dev->adj_list.upper,
- &upper_dev->adj_list.lower,
- private, master);
- if (ret) {
- __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
- return ret;
- }
-
- return 0;
+ return __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ &dev->adj_list.upper,
+ &upper_dev->adj_list.lower,
+ private, master);
}
static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
struct net_device *upper_dev)
{
- __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
&dev->adj_list.upper,
&upper_dev->adj_list.lower);
@@ -5770,7 +5954,6 @@ static int __netdev_upper_dev_link(struct net_device *dev,
void *upper_priv, void *upper_info)
{
struct netdev_notifier_changeupper_info changeupper_info;
- struct netdev_adjacent *i, *j, *to_i, *to_j;
int ret = 0;
ASSERT_RTNL();
@@ -5779,10 +5962,10 @@ static int __netdev_upper_dev_link(struct net_device *dev,
return -EBUSY;
/* To prevent loops, check if dev is not upper device to upper_dev. */
- if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
+ if (netdev_has_upper_dev(upper_dev, dev))
return -EBUSY;
- if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
+ if (netdev_has_upper_dev(dev, upper_dev))
return -EEXIST;
if (master && netdev_master_upper_dev_get(dev))
@@ -5804,80 +5987,15 @@ static int __netdev_upper_dev_link(struct net_device *dev,
if (ret)
return ret;
- /* Now that we linked these devs, make all the upper_dev's
- * all_adj_list.upper visible to every dev's all_adj_list.lower an
- * versa, and don't forget the devices itself. All of these
- * links are non-neighbours.
- */
- list_for_each_entry(i, &dev->all_adj_list.lower, list) {
- list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
- pr_debug("Interlinking %s with %s, non-neighbour\n",
- i->dev->name, j->dev->name);
- ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
- if (ret)
- goto rollback_mesh;
- }
- }
-
- /* add dev to every upper_dev's upper device */
- list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
- pr_debug("linking %s's upper device %s with %s\n",
- upper_dev->name, i->dev->name, dev->name);
- ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
- if (ret)
- goto rollback_upper_mesh;
- }
-
- /* add upper_dev to every dev's lower device */
- list_for_each_entry(i, &dev->all_adj_list.lower, list) {
- pr_debug("linking %s's lower device %s with %s\n", dev->name,
- i->dev->name, upper_dev->name);
- ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
- if (ret)
- goto rollback_lower_mesh;
- }
-
ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
&changeupper_info.info);
ret = notifier_to_errno(ret);
if (ret)
- goto rollback_lower_mesh;
+ goto rollback;
return 0;
-rollback_lower_mesh:
- to_i = i;
- list_for_each_entry(i, &dev->all_adj_list.lower, list) {
- if (i == to_i)
- break;
- __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
- }
-
- i = NULL;
-
-rollback_upper_mesh:
- to_i = i;
- list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
- if (i == to_i)
- break;
- __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
- }
-
- i = j = NULL;
-
-rollback_mesh:
- to_i = i;
- to_j = j;
- list_for_each_entry(i, &dev->all_adj_list.lower, list) {
- list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
- if (i == to_i && j == to_j)
- break;
- __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
- }
- if (i == to_i)
- break;
- }
-
+rollback:
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
return ret;
@@ -5934,7 +6052,7 @@ void netdev_upper_dev_unlink(struct net_device *dev,
struct net_device *upper_dev)
{
struct netdev_notifier_changeupper_info changeupper_info;
- struct netdev_adjacent *i, *j;
+
ASSERT_RTNL();
changeupper_info.upper_dev = upper_dev;
@@ -5946,23 +6064,6 @@ void netdev_upper_dev_unlink(struct net_device *dev,
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
- /* Here is the tricky part. We must remove all dev's lower
- * devices from all upper_dev's upper devices and vice
- * versa, to maintain the graph relationship.
- */
- list_for_each_entry(i, &dev->all_adj_list.lower, list)
- list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
- __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
-
- /* remove also the devices itself from lower/upper device
- * list
- */
- list_for_each_entry(i, &dev->all_adj_list.lower, list)
- __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
-
- list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
- __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
-
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
&changeupper_info.info);
}
@@ -6118,50 +6219,6 @@ void netdev_lower_state_changed(struct net_device *lower_dev,
}
EXPORT_SYMBOL(netdev_lower_state_changed);
-int netdev_default_l2upper_neigh_construct(struct net_device *dev,
- struct neighbour *n)
-{
- struct net_device *lower_dev, *stop_dev;
- struct list_head *iter;
- int err;
-
- netdev_for_each_lower_dev(dev, lower_dev, iter) {
- if (!lower_dev->netdev_ops->ndo_neigh_construct)
- continue;
- err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
- if (err) {
- stop_dev = lower_dev;
- goto rollback;
- }
- }
- return 0;
-
-rollback:
- netdev_for_each_lower_dev(dev, lower_dev, iter) {
- if (lower_dev == stop_dev)
- break;
- if (!lower_dev->netdev_ops->ndo_neigh_destroy)
- continue;
- lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
- }
- return err;
-}
-EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
-
-void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
- struct neighbour *n)
-{
- struct net_device *lower_dev;
- struct list_head *iter;
-
- netdev_for_each_lower_dev(dev, lower_dev, iter) {
- if (!lower_dev->netdev_ops->ndo_neigh_destroy)
- continue;
- lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
- }
-}
-EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
-
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -6414,8 +6471,8 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
}
/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
- is important. Some (broken) drivers set IFF_PROMISC, when
- IFF_ALLMULTI is requested not asking us and not reporting.
+ * is important. Some (broken) drivers set IFF_PROMISC, when
+ * IFF_ALLMULTI is requested not asking us and not reporting.
*/
if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
@@ -6500,9 +6557,18 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
if (new_mtu == dev->mtu)
return 0;
- /* MTU must be positive. */
- if (new_mtu < 0)
+ /* MTU must be positive, and in range */
+ if (new_mtu < 0 || new_mtu < dev->min_mtu) {
+ net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
+ dev->name, new_mtu, dev->min_mtu);
+ return -EINVAL;
+ }
+
+ if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
+ net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
+ dev->name, new_mtu, dev->max_mtu);
return -EINVAL;
+ }
if (!netif_device_present(dev))
return -ENODEV;
@@ -6649,33 +6715,48 @@ EXPORT_SYMBOL(dev_change_proto_down);
* dev_change_xdp_fd - set or clear a bpf program for a device rx path
* @dev: device
* @fd: new program fd or negative value to clear
+ * @flags: xdp-related flags
*
* Set or clear a bpf program for a device
*/
-int dev_change_xdp_fd(struct net_device *dev, int fd)
+int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
struct bpf_prog *prog = NULL;
- struct netdev_xdp xdp = {};
+ struct netdev_xdp xdp;
int err;
+ ASSERT_RTNL();
+
if (!ops->ndo_xdp)
return -EOPNOTSUPP;
if (fd >= 0) {
+ if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_QUERY_PROG;
+
+ err = ops->ndo_xdp(dev, &xdp);
+ if (err < 0)
+ return err;
+ if (xdp.prog_attached)
+ return -EBUSY;
+ }
+
prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
if (IS_ERR(prog))
return PTR_ERR(prog);
}
+ memset(&xdp, 0, sizeof(xdp));
xdp.command = XDP_SETUP_PROG;
xdp.prog = prog;
+
err = ops->ndo_xdp(dev, &xdp);
if (err < 0 && prog)
bpf_prog_put(prog);
return err;
}
-EXPORT_SYMBOL(dev_change_xdp_fd);
/**
* dev_new_index - allocate an ifindex
@@ -6688,6 +6769,7 @@ EXPORT_SYMBOL(dev_change_xdp_fd);
static int dev_new_index(struct net *net)
{
int ifindex = net->ifindex;
+
for (;;) {
if (++ifindex <= 0)
ifindex = 1;
@@ -6754,8 +6836,8 @@ static void rollback_registered_many(struct list_head *head)
/* Notify protocols, that we are about to destroy
- this device. They should clean all the things.
- */
+ * this device. They should clean all the things.
+ */
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
if (!dev->rtnl_link_ops ||
@@ -6777,6 +6859,7 @@ static void rollback_registered_many(struct list_head *head)
/* Notifier chain MUST detach us all upper devices. */
WARN_ON(netdev_has_any_upper_dev(dev));
+ WARN_ON(netdev_has_any_lower_dev(dev));
/* Remove entries from kobject tree */
netdev_unregister_kobject(dev);
@@ -6912,13 +6995,6 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
features &= ~dev->gso_partial_features;
}
-#ifdef CONFIG_NET_RX_BUSY_POLL
- if (dev->netdev_ops->ndo_busy_poll)
- features |= NETIF_F_BUSY_POLL;
- else
-#endif
- features &= ~NETIF_F_BUSY_POLL;
-
return features;
}
@@ -7107,6 +7183,7 @@ void netif_tx_stop_all_queues(struct net_device *dev)
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+
netif_tx_stop_queue(txq);
}
}
@@ -7581,17 +7658,17 @@ void netdev_freemem(struct net_device *dev)
}
/**
- * alloc_netdev_mqs - allocate network device
- * @sizeof_priv: size of private data to allocate space for
- * @name: device name format string
- * @name_assign_type: origin of device name
- * @setup: callback to initialize device
- * @txqs: the number of TX subqueues to allocate
- * @rxqs: the number of RX subqueues to allocate
- *
- * Allocates a struct net_device with private data area for driver use
- * and performs basic initialization. Also allocates subqueue structs
- * for each queue on the device.
+ * alloc_netdev_mqs - allocate network device
+ * @sizeof_priv: size of private data to allocate space for
+ * @name: device name format string
+ * @name_assign_type: origin of device name
+ * @setup: callback to initialize device
+ * @txqs: the number of TX subqueues to allocate
+ * @rxqs: the number of RX subqueues to allocate
+ *
+ * Allocates a struct net_device with private data area for driver use
+ * and performs basic initialization. Also allocates subqueue structs
+ * for each queue on the device.
*/
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned char name_assign_type,
@@ -7655,8 +7732,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
INIT_LIST_HEAD(&dev->link_watch_list);
INIT_LIST_HEAD(&dev->adj_list.upper);
INIT_LIST_HEAD(&dev->adj_list.lower);
- INIT_LIST_HEAD(&dev->all_adj_list.upper);
- INIT_LIST_HEAD(&dev->all_adj_list.lower);
INIT_LIST_HEAD(&dev->ptype_all);
INIT_LIST_HEAD(&dev->ptype_specific);
#ifdef CONFIG_NET_SCHED
@@ -7667,7 +7742,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
if (!dev->tx_queue_len) {
dev->priv_flags |= IFF_NO_QUEUE;
- dev->tx_queue_len = 1;
+ dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
}
dev->num_tx_queues = txqs;
@@ -7705,13 +7780,13 @@ free_dev:
EXPORT_SYMBOL(alloc_netdev_mqs);
/**
- * free_netdev - free network device
- * @dev: device
+ * free_netdev - free network device
+ * @dev: device
*
- * This function does the last stage of destroying an allocated device
- * interface. The reference to the device object is released.
- * If this is the last reference then it will be freed.
- * Must be called in process context.
+ * This function does the last stage of destroying an allocated device
+ * interface. The reference to the device object is released. If this
+ * is the last reference then it will be freed.Must be called in process
+ * context.
*/
void free_netdev(struct net_device *dev)
{
@@ -7893,12 +7968,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
dev_shutdown(dev);
/* Notify protocols, that we are about to destroy
- this device. They should clean all the things.
-
- Note that dev->reg_state stays at NETREG_REGISTERED.
- This is wanted because this way 8021q and macvlan know
- the device is just moving and can keep their slaves up.
- */
+ * this device. They should clean all the things.
+ *
+ * Note that dev->reg_state stays at NETREG_REGISTERED.
+ * This is wanted because this way 8021q and macvlan know
+ * the device is just moving and can keep their slaves up.
+ */
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
rcu_barrier();
call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
@@ -7948,18 +8023,13 @@ out:
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
-static int dev_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *ocpu)
+static int dev_cpu_dead(unsigned int oldcpu)
{
struct sk_buff **list_skb;
struct sk_buff *skb;
- unsigned int cpu, oldcpu = (unsigned long)ocpu;
+ unsigned int cpu;
struct softnet_data *sd, *oldsd;
- if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
- return NOTIFY_OK;
-
local_irq_disable();
cpu = smp_processor_id();
sd = &per_cpu(softnet_data, cpu);
@@ -8009,10 +8079,9 @@ static int dev_cpu_callback(struct notifier_block *nfb,
input_queue_head_incr(oldsd);
}
- return NOTIFY_OK;
+ return 0;
}
-
/**
* netdev_increment_features - increment feature set by one
* @all: current feature set
@@ -8346,7 +8415,9 @@ static int __init net_dev_init(void)
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
- hotcpu_notifier(dev_cpu_callback, 0);
+ rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
+ NULL, dev_cpu_dead);
+ WARN_ON(rc < 0);
dst_subsys_init();
rc = 0;
out:
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 1b5063088f1a..e9c1e6acfb6d 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -341,15 +341,7 @@ static void devlink_nl_post_doit(const struct genl_ops *ops,
mutex_unlock(&devlink_mutex);
}
-static struct genl_family devlink_nl_family = {
- .id = GENL_ID_GENERATE,
- .name = DEVLINK_GENL_NAME,
- .version = DEVLINK_GENL_VERSION,
- .maxattr = DEVLINK_ATTR_MAX,
- .netnsok = true,
- .pre_doit = devlink_nl_pre_doit,
- .post_doit = devlink_nl_post_doit,
-};
+static struct genl_family devlink_nl_family;
enum devlink_multicast_groups {
DEVLINK_MCGRP_CONFIG,
@@ -608,6 +600,8 @@ static int devlink_port_type_set(struct devlink *devlink,
if (devlink->ops && devlink->ops->port_type_set) {
if (port_type == DEVLINK_PORT_TYPE_NOTSET)
return -EINVAL;
+ if (port_type == devlink_port->type)
+ return 0;
err = devlink->ops->port_type_set(devlink_port, port_type);
if (err)
return err;
@@ -1398,52 +1392,68 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
return -EOPNOTSUPP;
}
-static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
- enum devlink_command cmd, u32 portid,
- u32 seq, int flags, u16 mode)
+static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
{
+ const struct devlink_ops *ops = devlink->ops;
void *hdr;
+ int err = 0;
+ u16 mode;
+ u8 inline_mode;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
- if (devlink_nl_put_handle(msg, devlink))
+ err = devlink_nl_put_handle(msg, devlink);
+ if (err)
goto nla_put_failure;
- if (nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode))
- goto nla_put_failure;
+ if (ops->eswitch_mode_get) {
+ err = ops->eswitch_mode_get(devlink, &mode);
+ if (err)
+ goto nla_put_failure;
+ err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ if (ops->eswitch_inline_mode_get) {
+ err = ops->eswitch_inline_mode_get(devlink, &inline_mode);
+ if (err)
+ goto nla_put_failure;
+ err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE,
+ inline_mode);
+ if (err)
+ goto nla_put_failure;
+ }
genlmsg_end(msg, hdr);
return 0;
nla_put_failure:
genlmsg_cancel(msg, hdr);
- return -EMSGSIZE;
+ return err;
}
-static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
- struct genl_info *info)
+static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
const struct devlink_ops *ops = devlink->ops;
struct sk_buff *msg;
- u16 mode;
int err;
- if (!ops || !ops->eswitch_mode_get)
+ if (!ops)
return -EOPNOTSUPP;
- err = ops->eswitch_mode_get(devlink, &mode);
- if (err)
- return err;
-
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
- err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET,
- info->snd_portid, info->snd_seq, 0, mode);
+ err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET,
+ info->snd_portid, info->snd_seq, 0);
if (err) {
nlmsg_free(msg);
@@ -1453,21 +1463,38 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
return genlmsg_reply(msg, info);
}
-static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb,
- struct genl_info *info)
+static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
const struct devlink_ops *ops = devlink->ops;
u16 mode;
+ u8 inline_mode;
+ int err = 0;
- if (!info->attrs[DEVLINK_ATTR_ESWITCH_MODE])
- return -EINVAL;
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) {
+ if (!ops->eswitch_mode_set)
+ return -EOPNOTSUPP;
+ mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+ err = ops->eswitch_mode_set(devlink, mode);
+ if (err)
+ return err;
+ }
- mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) {
+ if (!ops->eswitch_inline_mode_set)
+ return -EOPNOTSUPP;
+ inline_mode = nla_get_u8(
+ info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]);
+ err = ops->eswitch_inline_mode_set(devlink, inline_mode);
+ if (err)
+ return err;
+ }
- if (ops && ops->eswitch_mode_set)
- return ops->eswitch_mode_set(devlink, mode);
- return -EOPNOTSUPP;
+ return 0;
}
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
@@ -1484,6 +1511,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 },
[DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 },
[DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 },
};
static const struct genl_ops devlink_nl_ops[] = {
@@ -1603,21 +1631,35 @@ static const struct genl_ops devlink_nl_ops[] = {
DEVLINK_NL_FLAG_LOCK_PORTS,
},
{
- .cmd = DEVLINK_CMD_ESWITCH_MODE_GET,
- .doit = devlink_nl_cmd_eswitch_mode_get_doit,
+ .cmd = DEVLINK_CMD_ESWITCH_GET,
+ .doit = devlink_nl_cmd_eswitch_get_doit,
.policy = devlink_nl_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
{
- .cmd = DEVLINK_CMD_ESWITCH_MODE_SET,
- .doit = devlink_nl_cmd_eswitch_mode_set_doit,
+ .cmd = DEVLINK_CMD_ESWITCH_SET,
+ .doit = devlink_nl_cmd_eswitch_set_doit,
.policy = devlink_nl_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
};
+static struct genl_family devlink_nl_family __ro_after_init = {
+ .name = DEVLINK_GENL_NAME,
+ .version = DEVLINK_GENL_VERSION,
+ .maxattr = DEVLINK_ATTR_MAX,
+ .netnsok = true,
+ .pre_doit = devlink_nl_pre_doit,
+ .post_doit = devlink_nl_post_doit,
+ .module = THIS_MODULE,
+ .ops = devlink_nl_ops,
+ .n_ops = ARRAY_SIZE(devlink_nl_ops),
+ .mcgrps = devlink_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps),
+};
+
/**
* devlink_alloc - Allocate new devlink instance resources
*
@@ -1840,9 +1882,7 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister);
static int __init devlink_module_init(void)
{
- return genl_register_family_with_ops_groups(&devlink_nl_family,
- devlink_nl_ops,
- devlink_nl_mcgrps);
+ return genl_register_family(&devlink_nl_family);
}
static void __exit devlink_module_exit(void)
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 72cfb0c61125..fb55327dcfea 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -59,12 +59,7 @@ struct dm_hw_stat_delta {
unsigned long last_drop_val;
};
-static struct genl_family net_drop_monitor_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = "NET_DM",
- .version = 2,
-};
+static struct genl_family net_drop_monitor_family;
static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
@@ -80,6 +75,7 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
struct nlattr *nla;
struct sk_buff *skb;
unsigned long flags;
+ void *msg_header;
al = sizeof(struct net_dm_alert_msg);
al += dm_hit_limit * sizeof(struct net_dm_drop_point);
@@ -87,21 +83,41 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
skb = genlmsg_new(al, GFP_KERNEL);
- if (skb) {
- genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
- 0, NET_DM_CMD_ALERT);
- nla = nla_reserve(skb, NLA_UNSPEC,
- sizeof(struct net_dm_alert_msg));
- msg = nla_data(nla);
- memset(msg, 0, al);
- } else {
- mod_timer(&data->send_timer, jiffies + HZ / 10);
+ if (!skb)
+ goto err;
+
+ msg_header = genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
+ 0, NET_DM_CMD_ALERT);
+ if (!msg_header) {
+ nlmsg_free(skb);
+ skb = NULL;
+ goto err;
+ }
+ nla = nla_reserve(skb, NLA_UNSPEC,
+ sizeof(struct net_dm_alert_msg));
+ if (!nla) {
+ nlmsg_free(skb);
+ skb = NULL;
+ goto err;
}
+ msg = nla_data(nla);
+ memset(msg, 0, al);
+ goto out;
+err:
+ mod_timer(&data->send_timer, jiffies + HZ / 10);
+out:
spin_lock_irqsave(&data->lock, flags);
swap(data->skb, skb);
spin_unlock_irqrestore(&data->lock, flags);
+ if (skb) {
+ struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
+ struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlh);
+
+ genlmsg_end(skb, genlmsg_data(gnlh));
+ }
+
return skb;
}
@@ -351,6 +367,17 @@ static const struct genl_ops dropmon_ops[] = {
},
};
+static struct genl_family net_drop_monitor_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = "NET_DM",
+ .version = 2,
+ .module = THIS_MODULE,
+ .ops = dropmon_ops,
+ .n_ops = ARRAY_SIZE(dropmon_ops),
+ .mcgrps = dropmon_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps),
+};
+
static struct notifier_block dropmon_net_notifier = {
.notifier_call = dropmon_net_event
};
@@ -367,8 +394,7 @@ static int __init init_net_drop_monitor(void)
return -ENOSPC;
}
- rc = genl_register_family_with_ops_groups(&net_drop_monitor_family,
- dropmon_ops, dropmon_mcgrps);
+ rc = genl_register_family(&net_drop_monitor_family);
if (rc) {
pr_err("Could not create drop monitor netlink family\n");
return rc;
diff --git a/net/core/dst.c b/net/core/dst.c
index b5cbbe07f786..960e503b5a52 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -190,7 +190,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
dst->__use = 0;
dst->lastuse = jiffies;
dst->flags = flags;
- dst->pending_confirm = 0;
dst->next = NULL;
if (!(flags & DST_NOCOUNT))
dst_entries_add(ops, 1);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 047a1752ece1..aecb2c7241b6 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -24,7 +24,7 @@
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/rtnetlink.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/net.h>
/*
@@ -102,7 +102,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_RXFCS_BIT] = "rx-fcs",
[NETIF_F_RXALL_BIT] = "rx-all",
[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
- [NETIF_F_BUSY_POLL_BIT] = "busy-poll",
[NETIF_F_HW_TC_BIT] = "hw-tc-offload",
};
@@ -119,6 +118,12 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
[ETHTOOL_TX_COPYBREAK] = "tx-copybreak",
};
+static const char
+phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_ID_UNSPEC] = "Unspec",
+ [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift",
+};
+
static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
{
struct ethtool_gfeatures cmd = {
@@ -227,6 +232,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
if (sset == ETH_SS_TUNABLES)
return ARRAY_SIZE(tunable_strings);
+ if (sset == ETH_SS_PHY_TUNABLES)
+ return ARRAY_SIZE(phy_tunable_strings);
+
if (sset == ETH_SS_PHY_STATS) {
if (dev->phydev)
return phy_get_sset_count(dev->phydev);
@@ -253,6 +261,8 @@ static void __ethtool_get_strings(struct net_device *dev,
sizeof(rss_hash_func_strings));
else if (stringset == ETH_SS_TUNABLES)
memcpy(data, tunable_strings, sizeof(tunable_strings));
+ else if (stringset == ETH_SS_PHY_TUNABLES)
+ memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
else if (stringset == ETH_SS_PHY_STATS) {
struct phy_device *phydev = dev->phydev;
@@ -1394,9 +1404,12 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
if (regs.len > reglen)
regs.len = reglen;
- regbuf = vzalloc(reglen);
- if (reglen && !regbuf)
- return -ENOMEM;
+ regbuf = NULL;
+ if (reglen) {
+ regbuf = vzalloc(reglen);
+ if (!regbuf)
+ return -ENOMEM;
+ }
ops->get_regs(dev, &regs, regbuf);
@@ -1701,7 +1714,7 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
void __user *useraddr)
{
- struct ethtool_channels channels, max;
+ struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS };
u32 max_rx_in_use = 0;
if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels)
@@ -1806,11 +1819,13 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
ret = __ethtool_get_sset_count(dev, gstrings.string_set);
if (ret < 0)
return ret;
+ if (ret > S32_MAX / ETH_GSTRING_LEN)
+ return -ENOMEM;
+ WARN_ON_ONCE(!ret);
gstrings.len = ret;
-
- data = kcalloc(gstrings.len, ETH_GSTRING_LEN, GFP_USER);
- if (!data)
+ data = vzalloc(gstrings.len * ETH_GSTRING_LEN);
+ if (gstrings.len && !data)
return -ENOMEM;
__ethtool_get_strings(dev, gstrings.string_set, data);
@@ -1819,12 +1834,13 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
goto out;
useraddr += sizeof(gstrings);
- if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
+ if (gstrings.len &&
+ copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
goto out;
ret = 0;
out:
- kfree(data);
+ vfree(data);
return ret;
}
@@ -1901,14 +1917,15 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
n_stats = ops->get_sset_count(dev, ETH_SS_STATS);
if (n_stats < 0)
return n_stats;
- WARN_ON(n_stats == 0);
-
+ if (n_stats > S32_MAX / sizeof(u64))
+ return -ENOMEM;
+ WARN_ON_ONCE(!n_stats);
if (copy_from_user(&stats, useraddr, sizeof(stats)))
return -EFAULT;
stats.n_stats = n_stats;
- data = kmalloc(n_stats * sizeof(u64), GFP_USER);
- if (!data)
+ data = vzalloc(n_stats * sizeof(u64));
+ if (n_stats && !data)
return -ENOMEM;
ops->get_ethtool_stats(dev, &stats, data);
@@ -1917,12 +1934,12 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
if (copy_to_user(useraddr, &stats, sizeof(stats)))
goto out;
useraddr += sizeof(stats);
- if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64)))
+ if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64)))
goto out;
ret = 0;
out:
- kfree(data);
+ vfree(data);
return ret;
}
@@ -1937,17 +1954,18 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
return -EOPNOTSUPP;
n_stats = phy_get_sset_count(phydev);
-
if (n_stats < 0)
return n_stats;
- WARN_ON(n_stats == 0);
+ if (n_stats > S32_MAX / sizeof(u64))
+ return -ENOMEM;
+ WARN_ON_ONCE(!n_stats);
if (copy_from_user(&stats, useraddr, sizeof(stats)))
return -EFAULT;
stats.n_stats = n_stats;
- data = kmalloc_array(n_stats, sizeof(u64), GFP_USER);
- if (!data)
+ data = vzalloc(n_stats * sizeof(u64));
+ if (n_stats && !data)
return -ENOMEM;
mutex_lock(&phydev->lock);
@@ -1958,12 +1976,12 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
if (copy_to_user(useraddr, &stats, sizeof(stats)))
goto out;
useraddr += sizeof(stats);
- if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64)))
+ if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64)))
goto out;
ret = 0;
out:
- kfree(data);
+ vfree(data);
return ret;
}
@@ -2422,6 +2440,85 @@ static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
};
}
+static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna)
+{
+ switch (tuna->id) {
+ case ETHTOOL_PHY_DOWNSHIFT:
+ if (tuna->len != sizeof(u8) ||
+ tuna->type_id != ETHTOOL_TUNABLE_U8)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int get_phy_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ struct phy_device *phydev = dev->phydev;
+ void *data;
+
+ if (!(phydev && phydev->drv && phydev->drv->get_tunable))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_phy_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kmalloc(tuna.len, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->get_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
+ if (ret)
+ goto out;
+ useraddr += sizeof(tuna);
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, data, tuna.len))
+ goto out;
+ ret = 0;
+
+out:
+ kfree(data);
+ return ret;
+}
+
+static int set_phy_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ struct phy_device *phydev = dev->phydev;
+ void *data;
+
+ if (!(phydev && phydev->drv && phydev->drv->set_tunable))
+ return -EOPNOTSUPP;
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_phy_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kmalloc(tuna.len, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+ useraddr += sizeof(tuna);
+ ret = -EFAULT;
+ if (copy_from_user(data, useraddr, tuna.len))
+ goto out;
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->set_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
+
+out:
+ kfree(data);
+ return ret;
+}
+
/* The main entry point in this file. Called from net/core/dev_ioctl.c */
int dev_ethtool(struct net *net, struct ifreq *ifr)
@@ -2479,6 +2576,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GET_TS_INFO:
case ETHTOOL_GEEE:
case ETHTOOL_GTUNABLE:
+ case ETHTOOL_PHY_GTUNABLE:
case ETHTOOL_GLINKSETTINGS:
break;
default:
@@ -2685,6 +2783,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SLINKSETTINGS:
rc = ethtool_set_link_ksettings(dev, useraddr);
break;
+ case ETHTOOL_PHY_GTUNABLE:
+ rc = get_phy_tunable(dev, useraddr);
+ break;
+ case ETHTOOL_PHY_STUNABLE:
+ rc = set_phy_tunable(dev, useraddr);
+ break;
default:
rc = -EOPNOTSUPP;
}
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index be4629c344a6..b6791d94841d 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -18,6 +18,11 @@
#include <net/fib_rules.h>
#include <net/ip_tunnels.h>
+static const struct fib_kuid_range fib_kuid_range_unset = {
+ KUIDT_INIT(0),
+ KUIDT_INIT(~0),
+};
+
int fib_default_rule_add(struct fib_rules_ops *ops,
u32 pref, u32 table, u32 flags)
{
@@ -33,6 +38,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->table = table;
r->flags = flags;
r->fr_net = ops->fro_net;
+ r->uid_range = fib_kuid_range_unset;
r->suppress_prefixlen = -1;
r->suppress_ifgroup = -1;
@@ -172,6 +178,34 @@ void fib_rules_unregister(struct fib_rules_ops *ops)
}
EXPORT_SYMBOL_GPL(fib_rules_unregister);
+static int uid_range_set(struct fib_kuid_range *range)
+{
+ return uid_valid(range->start) && uid_valid(range->end);
+}
+
+static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb)
+{
+ struct fib_rule_uid_range *in;
+ struct fib_kuid_range out;
+
+ in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]);
+
+ out.start = make_kuid(current_user_ns(), in->start);
+ out.end = make_kuid(current_user_ns(), in->end);
+
+ return out;
+}
+
+static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range)
+{
+ struct fib_rule_uid_range out = {
+ from_kuid_munged(current_user_ns(), range->start),
+ from_kuid_munged(current_user_ns(), range->end)
+ };
+
+ return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out);
+}
+
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
struct flowi *fl, int flags,
struct fib_lookup_arg *arg)
@@ -193,6 +227,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg))
goto out;
+ if (uid_lt(fl->flowi_uid, rule->uid_range.start) ||
+ uid_gt(fl->flowi_uid, rule->uid_range.end))
+ goto out;
+
ret = ops->match(rule, fl, flags);
out:
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
@@ -305,6 +343,10 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
if (r->l3mdev != rule->l3mdev)
continue;
+ if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+ !uid_eq(r->uid_range.end, rule->uid_range.end))
+ continue;
+
if (!ops->compare(r, frh, tb))
continue;
return 1;
@@ -429,6 +471,21 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh)
if (rule->l3mdev && rule->table)
goto errout_free;
+ if (tb[FRA_UID_RANGE]) {
+ if (current_user_ns() != net->user_ns) {
+ err = -EPERM;
+ goto errout_free;
+ }
+
+ rule->uid_range = nla_get_kuid_range(tb);
+
+ if (!uid_range_set(&rule->uid_range) ||
+ !uid_lte(rule->uid_range.start, rule->uid_range.end))
+ goto errout_free;
+ } else {
+ rule->uid_range = fib_kuid_range_unset;
+ }
+
if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
rule_exists(ops, frh, tb, rule)) {
err = -EEXIST;
@@ -497,6 +554,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
struct fib_rules_ops *ops = NULL;
struct fib_rule *rule, *tmp;
struct nlattr *tb[FRA_MAX+1];
+ struct fib_kuid_range range;
int err = -EINVAL;
if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
@@ -516,6 +574,14 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err < 0)
goto errout;
+ if (tb[FRA_UID_RANGE]) {
+ range = nla_get_kuid_range(tb);
+ if (!uid_range_set(&range))
+ goto errout;
+ } else {
+ range = fib_kuid_range_unset;
+ }
+
list_for_each_entry(rule, &ops->rules_list, list) {
if (frh->action && (frh->action != rule->action))
continue;
@@ -552,6 +618,11 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
(rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV])))
continue;
+ if (uid_range_set(&range) &&
+ (!uid_eq(rule->uid_range.start, range.start) ||
+ !uid_eq(rule->uid_range.end, range.end)))
+ continue;
+
if (!ops->compare(rule, frh, tb))
continue;
@@ -619,7 +690,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ nla_total_size(4) /* FRA_FWMARK */
+ nla_total_size(4) /* FRA_FWMASK */
- + nla_total_size_64bit(8); /* FRA_TUN_ID */
+ + nla_total_size_64bit(8) /* FRA_TUN_ID */
+ + nla_total_size(sizeof(struct fib_kuid_range));
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -679,7 +751,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
(rule->tun_id &&
nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) ||
(rule->l3mdev &&
- nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)))
+ nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) ||
+ (uid_range_set(&rule->uid_range) &&
+ nla_put_uid_range(skb, &rule->uid_range)))
goto nla_put_failure;
if (rule->suppress_ifgroup != -1) {
diff --git a/net/core/filter.c b/net/core/filter.c
index b391209838ef..ebaeaf2e46e8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -30,6 +30,7 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
+#include <linux/if_arp.h>
#include <linux/gfp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -39,7 +40,7 @@
#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unaligned.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
@@ -75,8 +76,13 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
* allow SOCK_MEMALLOC sockets to use it as this socket is
* helping free memory
*/
- if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
+ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
return -ENOMEM;
+ }
+ err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
+ if (err)
+ return err;
err = security_sock_rcv_skb(sk, skb);
if (err)
@@ -1411,8 +1417,8 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_STACK,
- .arg4_type = ARG_CONST_STACK_SIZE,
+ .arg3_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_CONST_SIZE,
.arg5_type = ARG_ANYTHING,
};
@@ -1442,8 +1448,8 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_RAW_STACK,
- .arg4_type = ARG_CONST_STACK_SIZE,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
};
BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
@@ -1517,10 +1523,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
{
bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
+ bool do_mforce = flags & BPF_F_MARK_ENFORCE;
__sum16 *ptr;
- if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
- BPF_F_HDR_FIELD_MASK)))
+ if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
+ BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
return -EINVAL;
if (unlikely(offset > 0xffff || offset & 1))
return -EFAULT;
@@ -1528,7 +1535,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
return -EFAULT;
ptr = (__sum16 *)(skb->data + offset);
- if (is_mmzero && !*ptr)
+ if (is_mmzero && !do_mforce && !*ptr)
return 0;
switch (flags & BPF_F_HDR_FIELD_MASK) {
@@ -1596,10 +1603,10 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
.gpl_only = false,
.pkt_access = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_STACK,
- .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO,
- .arg3_type = ARG_PTR_TO_STACK,
- .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_CONST_SIZE_OR_ZERO,
.arg5_type = ARG_ANYTHING,
};
@@ -1684,6 +1691,12 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
u32 flags)
{
+ /* Verify that a link layer header is carried */
+ if (unlikely(skb->mac_header >= skb->network_header)) {
+ kfree_skb(skb);
+ return -ERANGE;
+ }
+
bpf_push_mac_rcsum(skb);
return flags & BPF_F_INGRESS ?
__bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
@@ -1692,17 +1705,10 @@ static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
u32 flags)
{
- switch (dev->type) {
- case ARPHRD_TUNNEL:
- case ARPHRD_TUNNEL6:
- case ARPHRD_SIT:
- case ARPHRD_IPGRE:
- case ARPHRD_VOID:
- case ARPHRD_NONE:
- return __bpf_redirect_no_mac(skb, dev, flags);
- default:
+ if (dev_is_mac_header_xmit(dev))
return __bpf_redirect_common(skb, dev, flags);
- }
+ else
+ return __bpf_redirect_no_mac(skb, dev, flags);
}
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
@@ -2190,16 +2196,79 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
.arg3_type = ARG_ANYTHING,
};
-bool bpf_helper_changes_skb_data(void *func)
+BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
+ u64, flags)
+{
+ u32 max_len = __bpf_skb_max_len(skb);
+ u32 new_len = skb->len + head_room;
+ int ret;
+
+ if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
+ new_len < skb->len))
+ return -EINVAL;
+
+ ret = skb_cow(skb, head_room);
+ if (likely(!ret)) {
+ /* Idea for this helper is that we currently only
+ * allow to expand on mac header. This means that
+ * skb->protocol network header, etc, stay as is.
+ * Compared to bpf_skb_change_tail(), we're more
+ * flexible due to not needing to linearize or
+ * reset GSO. Intention for this helper is to be
+ * used by an L3 skb that needs to push mac header
+ * for redirection into L2 device.
+ */
+ __skb_push(skb, head_room);
+ memset(skb->data, 0, head_room);
+ skb_reset_mac_header(skb);
+ }
+
+ bpf_compute_data_end(skb);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_change_head_proto = {
+ .func = bpf_skb_change_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
+{
+ void *data = xdp->data + offset;
+
+ if (unlikely(data < xdp->data_hard_start ||
+ data > xdp->data_end - ETH_HLEN))
+ return -EINVAL;
+
+ xdp->data = data;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
+ .func = bpf_xdp_adjust_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+bool bpf_helper_changes_pkt_data(void *func)
{
if (func == bpf_skb_vlan_push ||
func == bpf_skb_vlan_pop ||
func == bpf_skb_store_bytes ||
func == bpf_skb_change_proto ||
+ func == bpf_skb_change_head ||
func == bpf_skb_change_tail ||
func == bpf_skb_pull_data ||
func == bpf_l3_csum_replace ||
- func == bpf_l4_csum_replace)
+ func == bpf_l4_csum_replace ||
+ func == bpf_xdp_adjust_head)
return true;
return false;
@@ -2239,8 +2308,8 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_STACK,
- .arg5_type = ARG_CONST_STACK_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
};
static unsigned short bpf_tunnel_key_af(u64 flags)
@@ -2310,8 +2379,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_RAW_STACK,
- .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
};
@@ -2345,8 +2414,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_RAW_STACK,
- .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
};
static struct metadata_dst __percpu *md_dst;
@@ -2416,8 +2485,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_STACK,
- .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
};
@@ -2442,8 +2511,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_STACK,
- .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
};
static const struct bpf_func_proto *
@@ -2515,8 +2584,8 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
return -EFAULT;
- return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size,
- bpf_xdp_copy);
+ return bpf_event_output(map, flags, meta, meta_size, xdp->data,
+ xdp_size, bpf_xdp_copy);
}
static const struct bpf_func_proto bpf_xdp_event_output_proto = {
@@ -2526,12 +2595,12 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_STACK,
- .arg5_type = ARG_CONST_STACK_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
};
static const struct bpf_func_proto *
-sk_filter_func_proto(enum bpf_func_id func_id)
+bpf_base_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
@@ -2544,6 +2613,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_raw_smp_processor_id_proto;
+ case BPF_FUNC_get_numa_node_id:
+ return &bpf_get_numa_node_id_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_ktime_get_ns:
@@ -2557,6 +2628,17 @@ sk_filter_func_proto(enum bpf_func_id func_id)
}
static const struct bpf_func_proto *
+sk_filter_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
tc_cls_act_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -2611,7 +2693,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_skb_under_cgroup:
return &bpf_skb_under_cgroup_proto;
default:
- return sk_filter_func_proto(func_id);
+ return bpf_base_func_proto(func_id);
}
}
@@ -2623,20 +2705,106 @@ xdp_func_proto(enum bpf_func_id func_id)
return &bpf_xdp_event_output_proto;
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_xdp_adjust_head:
+ return &bpf_xdp_adjust_head_proto;
default:
- return sk_filter_func_proto(func_id);
+ return bpf_base_func_proto(func_id);
}
}
-static bool __is_valid_access(int off, int size, enum bpf_access_type type)
+static const struct bpf_func_proto *
+cg_skb_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_inout_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_proto;
+ case BPF_FUNC_get_route_realm:
+ return &bpf_get_route_realm_proto;
+ case BPF_FUNC_get_hash_recalc:
+ return &bpf_get_hash_recalc_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_skb_under_cgroup:
+ return &bpf_skb_under_cgroup_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_xmit_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_get_tunnel_key:
+ return &bpf_skb_get_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_key:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_skb_get_tunnel_opt:
+ return &bpf_skb_get_tunnel_opt_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_redirect:
+ return &bpf_redirect_proto;
+ case BPF_FUNC_clone_redirect:
+ return &bpf_clone_redirect_proto;
+ case BPF_FUNC_skb_change_tail:
+ return &bpf_skb_change_tail_proto;
+ case BPF_FUNC_skb_change_head:
+ return &bpf_skb_change_head_proto;
+ case BPF_FUNC_skb_store_bytes:
+ return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_csum_update:
+ return &bpf_csum_update_proto;
+ case BPF_FUNC_l3_csum_replace:
+ return &bpf_l3_csum_replace_proto;
+ case BPF_FUNC_l4_csum_replace:
+ return &bpf_l4_csum_replace_proto;
+ case BPF_FUNC_set_hash_invalid:
+ return &bpf_set_hash_invalid_proto;
+ default:
+ return lwt_inout_func_proto(func_id);
+ }
+}
+
+static bool __is_valid_access(int off, int size)
{
if (off < 0 || off >= sizeof(struct __sk_buff))
return false;
+
/* The verifier guarantees that size > 0. */
if (off % size != 0)
return false;
- if (size != sizeof(__u32))
- return false;
+
+ switch (off) {
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
+ if (off + size >
+ offsetof(struct __sk_buff, cb[4]) + sizeof(__u32))
+ return false;
+ break;
+ default:
+ if (size != sizeof(__u32))
+ return false;
+ }
return true;
}
@@ -2655,14 +2823,71 @@ static bool sk_filter_is_valid_access(int off, int size,
if (type == BPF_WRITE) {
switch (off) {
case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]):
+ offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
+ break;
+ default:
+ return false;
+ }
+ }
+
+ return __is_valid_access(off, size);
+}
+
+static bool lwt_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
+{
+ switch (off) {
+ case offsetof(struct __sk_buff, tc_classid):
+ return false;
+ }
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct __sk_buff, mark):
+ case offsetof(struct __sk_buff, priority):
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case offsetof(struct __sk_buff, data):
+ *reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct __sk_buff, data_end):
+ *reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return __is_valid_access(off, size);
+}
+
+static bool sock_filter_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
+{
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct bpf_sock, bound_dev_if):
break;
default:
return false;
}
}
- return __is_valid_access(off, size, type);
+ if (off < 0 || off + size > sizeof(struct bpf_sock))
+ return false;
+ /* The verifier guarantees that size > 0. */
+ if (off % size != 0)
+ return false;
+ if (size != sizeof(__u32))
+ return false;
+
+ return true;
}
static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
@@ -2714,7 +2939,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
case offsetof(struct __sk_buff, tc_index):
case offsetof(struct __sk_buff, priority):
case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]):
+ offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
case offsetof(struct __sk_buff, tc_classid):
break;
default:
@@ -2731,11 +2956,10 @@ static bool tc_cls_act_is_valid_access(int off, int size,
break;
}
- return __is_valid_access(off, size, type);
+ return __is_valid_access(off, size);
}
-static bool __is_valid_xdp_access(int off, int size,
- enum bpf_access_type type)
+static bool __is_valid_xdp_access(int off, int size)
{
if (off < 0 || off >= sizeof(struct xdp_md))
return false;
@@ -2763,7 +2987,7 @@ static bool xdp_is_valid_access(int off, int size,
break;
}
- return __is_valid_xdp_access(off, size, type);
+ return __is_valid_xdp_access(off, size);
}
void bpf_warn_invalid_xdp_action(u32 act)
@@ -2772,32 +2996,33 @@ void bpf_warn_invalid_xdp_action(u32 act)
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
-static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
- int src_reg, int ctx_off,
- struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+static u32 bpf_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
{
struct bpf_insn *insn = insn_buf;
+ int off;
- switch (ctx_off) {
+ switch (si->off) {
case offsetof(struct __sk_buff, len):
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, len));
break;
case offsetof(struct __sk_buff, protocol):
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
- *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, protocol));
break;
case offsetof(struct __sk_buff, vlan_proto):
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
- *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, vlan_proto));
break;
@@ -2805,17 +3030,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, priority));
else
- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, priority));
break;
case offsetof(struct __sk_buff, ingress_ifindex):
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);
- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, skb_iif));
break;
@@ -2823,17 +3048,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
- dst_reg, src_reg,
+ si->dst_reg, si->src_reg,
offsetof(struct sk_buff, dev));
- *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct net_device, ifindex));
break;
case offsetof(struct __sk_buff, hash):
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, hash));
break;
@@ -2841,63 +3066,77 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, mark));
else
- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, mark));
break;
case offsetof(struct __sk_buff, pkt_type):
- return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);
+ return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg,
+ si->src_reg, insn);
case offsetof(struct __sk_buff, queue_mapping):
- return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);
+ return convert_skb_access(SKF_AD_QUEUE, si->dst_reg,
+ si->src_reg, insn);
case offsetof(struct __sk_buff, vlan_present):
return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
- dst_reg, src_reg, insn);
+ si->dst_reg, si->src_reg, insn);
case offsetof(struct __sk_buff, vlan_tci):
return convert_skb_access(SKF_AD_VLAN_TAG,
- dst_reg, src_reg, insn);
+ si->dst_reg, si->src_reg, insn);
case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]):
+ offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
+ BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
+ offsetof(struct qdisc_skb_cb, data)) %
+ sizeof(__u64));
prog->cb_access = 1;
- ctx_off -= offsetof(struct __sk_buff, cb[0]);
- ctx_off += offsetof(struct sk_buff, cb);
- ctx_off += offsetof(struct qdisc_skb_cb, data);
+ off = si->off;
+ off -= offsetof(struct __sk_buff, cb[0]);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct qdisc_skb_cb, data);
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
+ *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg,
+ si->src_reg, off);
else
- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
+ si->src_reg, off);
break;
case offsetof(struct __sk_buff, tc_classid):
- ctx_off -= offsetof(struct __sk_buff, tc_classid);
- ctx_off += offsetof(struct sk_buff, cb);
- ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
+ BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2);
+
+ off = si->off;
+ off -= offsetof(struct __sk_buff, tc_classid);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct qdisc_skb_cb, tc_classid);
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+ *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg,
+ si->src_reg, off);
else
- *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
+ si->src_reg, off);
break;
case offsetof(struct __sk_buff, data):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
- dst_reg, src_reg,
+ si->dst_reg, si->src_reg,
offsetof(struct sk_buff, data));
break;
case offsetof(struct __sk_buff, data_end):
- ctx_off -= offsetof(struct __sk_buff, data_end);
- ctx_off += offsetof(struct sk_buff, cb);
- ctx_off += offsetof(struct bpf_skb_data_end, data_end);
- *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg,
- ctx_off);
+ off = si->off;
+ off -= offsetof(struct __sk_buff, data_end);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct bpf_skb_data_end, data_end);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+ si->src_reg, off);
break;
case offsetof(struct __sk_buff, tc_index):
@@ -2905,65 +3144,107 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg,
+ *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, tc_index));
else
- *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
offsetof(struct sk_buff, tc_index));
- break;
#else
if (type == BPF_WRITE)
- *insn++ = BPF_MOV64_REG(dst_reg, dst_reg);
+ *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
else
- *insn++ = BPF_MOV64_IMM(dst_reg, 0);
- break;
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#endif
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sock, bound_dev_if):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_bound_dev_if));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_bound_dev_if));
+ break;
+
+ case offsetof(struct bpf_sock, family):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_family));
+ break;
+
+ case offsetof(struct bpf_sock, type):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, __sk_flags_offset));
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
+ break;
+
+ case offsetof(struct bpf_sock, protocol):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, __sk_flags_offset));
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT);
+ break;
}
return insn - insn_buf;
}
-static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
- int src_reg, int ctx_off,
+static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog)
{
struct bpf_insn *insn = insn_buf;
- switch (ctx_off) {
+ switch (si->off) {
case offsetof(struct __sk_buff, ifindex):
BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
- dst_reg, src_reg,
+ si->dst_reg, si->src_reg,
offsetof(struct sk_buff, dev));
- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct net_device, ifindex));
break;
default:
- return sk_filter_convert_ctx_access(type, dst_reg, src_reg,
- ctx_off, insn_buf, prog);
+ return bpf_convert_ctx_access(type, si, insn_buf, prog);
}
return insn - insn_buf;
}
-static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
- int src_reg, int ctx_off,
+static u32 xdp_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog)
{
struct bpf_insn *insn = insn_buf;
- switch (ctx_off) {
+ switch (si->off) {
case offsetof(struct xdp_md, data):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
- dst_reg, src_reg,
+ si->dst_reg, si->src_reg,
offsetof(struct xdp_buff, data));
break;
case offsetof(struct xdp_md, data_end):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
- dst_reg, src_reg,
+ si->dst_reg, si->src_reg,
offsetof(struct xdp_buff, data_end));
break;
}
@@ -2974,7 +3255,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
static const struct bpf_verifier_ops sk_filter_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
- .convert_ctx_access = sk_filter_convert_ctx_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
};
static const struct bpf_verifier_ops tc_cls_act_ops = {
@@ -2990,32 +3271,87 @@ static const struct bpf_verifier_ops xdp_ops = {
.convert_ctx_access = xdp_convert_ctx_access,
};
-static struct bpf_prog_type_list sk_filter_type __read_mostly = {
+static const struct bpf_verifier_ops cg_skb_ops = {
+ .get_func_proto = cg_skb_func_proto,
+ .is_valid_access = sk_filter_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+static const struct bpf_verifier_ops lwt_inout_ops = {
+ .get_func_proto = lwt_inout_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+static const struct bpf_verifier_ops lwt_xmit_ops = {
+ .get_func_proto = lwt_xmit_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+ .gen_prologue = tc_cls_act_prologue,
+};
+
+static const struct bpf_verifier_ops cg_sock_ops = {
+ .get_func_proto = bpf_base_func_proto,
+ .is_valid_access = sock_filter_is_valid_access,
+ .convert_ctx_access = sock_filter_convert_ctx_access,
+};
+
+static struct bpf_prog_type_list sk_filter_type __ro_after_init = {
.ops = &sk_filter_ops,
.type = BPF_PROG_TYPE_SOCKET_FILTER,
};
-static struct bpf_prog_type_list sched_cls_type __read_mostly = {
+static struct bpf_prog_type_list sched_cls_type __ro_after_init = {
.ops = &tc_cls_act_ops,
.type = BPF_PROG_TYPE_SCHED_CLS,
};
-static struct bpf_prog_type_list sched_act_type __read_mostly = {
+static struct bpf_prog_type_list sched_act_type __ro_after_init = {
.ops = &tc_cls_act_ops,
.type = BPF_PROG_TYPE_SCHED_ACT,
};
-static struct bpf_prog_type_list xdp_type __read_mostly = {
+static struct bpf_prog_type_list xdp_type __ro_after_init = {
.ops = &xdp_ops,
.type = BPF_PROG_TYPE_XDP,
};
+static struct bpf_prog_type_list cg_skb_type __ro_after_init = {
+ .ops = &cg_skb_ops,
+ .type = BPF_PROG_TYPE_CGROUP_SKB,
+};
+
+static struct bpf_prog_type_list lwt_in_type __ro_after_init = {
+ .ops = &lwt_inout_ops,
+ .type = BPF_PROG_TYPE_LWT_IN,
+};
+
+static struct bpf_prog_type_list lwt_out_type __ro_after_init = {
+ .ops = &lwt_inout_ops,
+ .type = BPF_PROG_TYPE_LWT_OUT,
+};
+
+static struct bpf_prog_type_list lwt_xmit_type __ro_after_init = {
+ .ops = &lwt_xmit_ops,
+ .type = BPF_PROG_TYPE_LWT_XMIT,
+};
+
+static struct bpf_prog_type_list cg_sock_type __ro_after_init = {
+ .ops = &cg_sock_ops,
+ .type = BPF_PROG_TYPE_CGROUP_SOCK
+};
+
static int __init register_sk_filter_ops(void)
{
bpf_register_prog_type(&sk_filter_type);
bpf_register_prog_type(&sched_cls_type);
bpf_register_prog_type(&sched_act_type);
bpf_register_prog_type(&xdp_type);
+ bpf_register_prog_type(&cg_skb_type);
+ bpf_register_prog_type(&cg_sock_type);
+ bpf_register_prog_type(&lwt_in_type);
+ bpf_register_prog_type(&lwt_out_type);
+ bpf_register_prog_type(&lwt_xmit_type);
return 0;
}
diff --git a/net/core/flow.c b/net/core/flow.c
index 18e8893d4be5..f765c11d8df5 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -417,28 +417,20 @@ static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
return 0;
}
-static int flow_cache_cpu(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int flow_cache_cpu_up_prep(unsigned int cpu, struct hlist_node *node)
{
- struct flow_cache *fc = container_of(nfb, struct flow_cache,
- hotcpu_notifier);
- int res, cpu = (unsigned long) hcpu;
+ struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node);
+
+ return flow_cache_cpu_prepare(fc, cpu);
+}
+
+static int flow_cache_cpu_dead(unsigned int cpu, struct hlist_node *node)
+{
+ struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node);
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- res = flow_cache_cpu_prepare(fc, cpu);
- if (res)
- return notifier_from_errno(res);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- __flow_cache_shrink(fc, fcp, 0);
- break;
- }
- return NOTIFY_OK;
+ __flow_cache_shrink(fc, fcp, 0);
+ return 0;
}
int flow_cache_init(struct net *net)
@@ -465,18 +457,8 @@ int flow_cache_init(struct net *net)
if (!fc->percpu)
return -ENOMEM;
- cpu_notifier_register_begin();
-
- for_each_online_cpu(i) {
- if (flow_cache_cpu_prepare(fc, i))
- goto err;
- }
- fc->hotcpu_notifier = (struct notifier_block){
- .notifier_call = flow_cache_cpu,
- };
- __register_hotcpu_notifier(&fc->hotcpu_notifier);
-
- cpu_notifier_register_done();
+ if (cpuhp_state_add_instance(CPUHP_NET_FLOW_PREPARE, &fc->node))
+ goto err;
setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
(unsigned long) fc);
@@ -492,8 +474,6 @@ err:
fcp->hash_table = NULL;
}
- cpu_notifier_register_done();
-
free_percpu(fc->percpu);
fc->percpu = NULL;
@@ -507,7 +487,8 @@ void flow_cache_fini(struct net *net)
struct flow_cache *fc = &net->xfrm.flow_cache_global;
del_timer_sync(&fc->rnd_timer);
- unregister_hotcpu_notifier(&fc->hotcpu_notifier);
+
+ cpuhp_state_remove_instance_nocalls(CPUHP_NET_FLOW_PREPARE, &fc->node);
for_each_possible_cpu(i) {
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
@@ -519,3 +500,14 @@ void flow_cache_fini(struct net *net)
fc->percpu = NULL;
}
EXPORT_SYMBOL(flow_cache_fini);
+
+void __init flow_cache_hp_init(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state_multi(CPUHP_NET_FLOW_PREPARE,
+ "net/flow:prepare",
+ flow_cache_cpu_up_prep,
+ flow_cache_cpu_dead);
+ WARN_ON(ret < 0);
+}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index c6d8207ffa7e..d98d4998213d 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -58,6 +58,28 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
EXPORT_SYMBOL(skb_flow_dissector_init);
/**
+ * skb_flow_get_be16 - extract be16 entity
+ * @skb: sk_buff to extract from
+ * @poff: offset to extract at
+ * @data: raw buffer pointer to the packet
+ * @hlen: packet header length
+ *
+ * The function will try to retrieve a be32 entity at
+ * offset poff
+ */
+static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff,
+ void *data, int hlen)
+{
+ __be16 *u, _u;
+
+ u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u);
+ if (u)
+ return *u;
+
+ return 0;
+}
+
+/**
* __skb_flow_get_ports - extract the upper layer ports and return them
* @skb: sk_buff to extract the ports from
* @thoff: transport header offset
@@ -116,7 +138,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector_key_control *key_control;
struct flow_dissector_key_basic *key_basic;
struct flow_dissector_key_addrs *key_addrs;
+ struct flow_dissector_key_arp *key_arp;
struct flow_dissector_key_ports *key_ports;
+ struct flow_dissector_key_icmp *key_icmp;
struct flow_dissector_key_tags *key_tags;
struct flow_dissector_key_vlan *key_vlan;
struct flow_dissector_key_keyid *key_keyid;
@@ -356,6 +380,62 @@ mpls:
nhoff += FCOE_HEADER_LEN;
goto out_good;
+
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_RARP): {
+ struct {
+ unsigned char ar_sha[ETH_ALEN];
+ unsigned char ar_sip[4];
+ unsigned char ar_tha[ETH_ALEN];
+ unsigned char ar_tip[4];
+ } *arp_eth, _arp_eth;
+ const struct arphdr *arp;
+ struct arphdr _arp;
+
+ arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data,
+ hlen, &_arp);
+ if (!arp)
+ goto out_bad;
+
+ if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
+ arp->ar_pro != htons(ETH_P_IP) ||
+ arp->ar_hln != ETH_ALEN ||
+ arp->ar_pln != 4 ||
+ (arp->ar_op != htons(ARPOP_REPLY) &&
+ arp->ar_op != htons(ARPOP_REQUEST)))
+ goto out_bad;
+
+ arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
+ sizeof(_arp_eth), data,
+ hlen,
+ &_arp_eth);
+ if (!arp_eth)
+ goto out_bad;
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ARP)) {
+
+ key_arp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ARP,
+ target_container);
+
+ memcpy(&key_arp->sip, arp_eth->ar_sip,
+ sizeof(key_arp->sip));
+ memcpy(&key_arp->tip, arp_eth->ar_tip,
+ sizeof(key_arp->tip));
+
+ /* Only store the lower byte of the opcode;
+ * this covers ARPOP_REPLY and ARPOP_REQUEST.
+ */
+ key_arp->op = ntohs(arp->ar_op) & 0xff;
+
+ ether_addr_copy(key_arp->sha, arp_eth->ar_sha);
+ ether_addr_copy(key_arp->tha, arp_eth->ar_tha);
+ }
+
+ goto out_good;
+ }
+
default:
goto out_bad;
}
@@ -445,8 +525,9 @@ ip_proto_again:
if (hdr->flags & GRE_ACK)
offset += sizeof(((struct pptp_gre_header *)0)->ack);
- ppp_hdr = skb_header_pointer(skb, nhoff + offset,
- sizeof(_ppp_hdr), _ppp_hdr);
+ ppp_hdr = __skb_header_pointer(skb, nhoff + offset,
+ sizeof(_ppp_hdr),
+ data, hlen, _ppp_hdr);
if (!ppp_hdr)
goto out_bad;
@@ -546,6 +627,14 @@ ip_proto_again:
data, hlen);
}
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ICMP)) {
+ key_icmp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ICMP,
+ target_container);
+ key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen);
+ }
+
out_good:
ret = true;
@@ -726,7 +815,7 @@ EXPORT_SYMBOL(make_flow_keys_digest);
static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;
-u32 __skb_get_hash_symmetric(struct sk_buff *skb)
+u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
{
struct flow_keys keys;
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index cad8e791f28e..0385dece1f6f 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -7,13 +7,14 @@
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Eric Dumazet <edumazet@google.com>
*
* Changes:
* Jamal Hadi Salim - moved it to net/core and reshulfed
* names to make it usable in general net subsystem.
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -30,165 +31,79 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
-#include <linux/rbtree.h>
#include <linux/slab.h>
+#include <linux/seqlock.h>
#include <net/sock.h>
#include <net/gen_stats.h>
-/*
- This code is NOT intended to be used for statistics collection,
- its purpose is to provide a base for statistical multiplexing
- for controlled load service.
- If you need only statistics, run a user level daemon which
- periodically reads byte counters.
-
- Unfortunately, rate estimation is not a very easy task.
- F.e. I did not find a simple way to estimate the current peak rate
- and even failed to formulate the problem 8)8)
-
- So I preferred not to built an estimator into the scheduler,
- but run this task separately.
- Ideally, it should be kernel thread(s), but for now it runs
- from timers, which puts apparent top bounds on the number of rated
- flows, has minimal overhead on small, but is enough
- to handle controlled load service, sets of aggregates.
-
- We measure rate over A=(1<<interval) seconds and evaluate EWMA:
-
- avrate = avrate*(1-W) + rate*W
-
- where W is chosen as negative power of 2: W = 2^(-ewma_log)
-
- The resulting time constant is:
-
- T = A/(-ln(1-W))
-
-
- NOTES.
-
- * avbps and avpps are scaled by 2^5.
- * both values are reported as 32 bit unsigned values. bps can
- overflow for fast links : max speed being 34360Mbit/sec
- * Minimal interval is HZ/4=250msec (it is the greatest common divisor
- for HZ=100 and HZ=1024 8)), maximal interval
- is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
- are too expensive, longer ones can be implemented
- at user level painlessly.
+/* This code is NOT intended to be used for statistics collection,
+ * its purpose is to provide a base for statistical multiplexing
+ * for controlled load service.
+ * If you need only statistics, run a user level daemon which
+ * periodically reads byte counters.
*/
-#define EST_MAX_INTERVAL 5
-
-struct gen_estimator
-{
- struct list_head list;
+struct net_rate_estimator {
struct gnet_stats_basic_packed *bstats;
- struct gnet_stats_rate_est64 *rate_est;
spinlock_t *stats_lock;
seqcount_t *running;
- int ewma_log;
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+ u8 ewma_log;
+ u8 intvl_log; /* period : (250ms << intvl_log) */
+
+ seqcount_t seq;
u32 last_packets;
- unsigned long avpps;
u64 last_bytes;
+
+ u64 avpps;
u64 avbps;
- struct rcu_head e_rcu;
- struct rb_node node;
- struct gnet_stats_basic_cpu __percpu *cpu_bstats;
- struct rcu_head head;
-};
-struct gen_estimator_head
-{
- struct timer_list timer;
- struct list_head list;
+ unsigned long next_jiffies;
+ struct timer_list timer;
+ struct rcu_head rcu;
};
-static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
-
-/* Protects against NULL dereference */
-static DEFINE_RWLOCK(est_lock);
-
-/* Protects against soft lockup during large deletion */
-static struct rb_root est_root = RB_ROOT;
-static DEFINE_SPINLOCK(est_tree_lock);
-
-static void est_timer(unsigned long arg)
+static void est_fetch_counters(struct net_rate_estimator *e,
+ struct gnet_stats_basic_packed *b)
{
- int idx = (int)arg;
- struct gen_estimator *e;
+ if (e->stats_lock)
+ spin_lock(e->stats_lock);
- rcu_read_lock();
- list_for_each_entry_rcu(e, &elist[idx].list, list) {
- struct gnet_stats_basic_packed b = {0};
- unsigned long rate;
- u64 brate;
-
- if (e->stats_lock)
- spin_lock(e->stats_lock);
- read_lock(&est_lock);
- if (e->bstats == NULL)
- goto skip;
-
- __gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);
-
- brate = (b.bytes - e->last_bytes)<<(7 - idx);
- e->last_bytes = b.bytes;
- e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
- WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);
-
- rate = b.packets - e->last_packets;
- rate <<= (7 - idx);
- e->last_packets = b.packets;
- e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
- WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
-skip:
- read_unlock(&est_lock);
- if (e->stats_lock)
- spin_unlock(e->stats_lock);
- }
+ __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats);
+
+ if (e->stats_lock)
+ spin_unlock(e->stats_lock);
- if (!list_empty(&elist[idx].list))
- mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
- rcu_read_unlock();
}
-static void gen_add_node(struct gen_estimator *est)
+static void est_timer(unsigned long arg)
{
- struct rb_node **p = &est_root.rb_node, *parent = NULL;
+ struct net_rate_estimator *est = (struct net_rate_estimator *)arg;
+ struct gnet_stats_basic_packed b;
+ u64 rate, brate;
- while (*p) {
- struct gen_estimator *e;
+ est_fetch_counters(est, &b);
+ brate = (b.bytes - est->last_bytes) << (8 - est->ewma_log);
+ brate -= (est->avbps >> est->ewma_log);
- parent = *p;
- e = rb_entry(parent, struct gen_estimator, node);
+ rate = (u64)(b.packets - est->last_packets) << (8 - est->ewma_log);
+ rate -= (est->avpps >> est->ewma_log);
- if (est->bstats > e->bstats)
- p = &parent->rb_right;
- else
- p = &parent->rb_left;
- }
- rb_link_node(&est->node, parent, p);
- rb_insert_color(&est->node, &est_root);
-}
+ write_seqcount_begin(&est->seq);
+ est->avbps += brate;
+ est->avpps += rate;
+ write_seqcount_end(&est->seq);
-static
-struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est64 *rate_est)
-{
- struct rb_node *p = est_root.rb_node;
-
- while (p) {
- struct gen_estimator *e;
+ est->last_bytes = b.bytes;
+ est->last_packets = b.packets;
- e = rb_entry(p, struct gen_estimator, node);
+ est->next_jiffies += ((HZ/4) << est->intvl_log);
- if (bstats > e->bstats)
- p = p->rb_right;
- else if (bstats < e->bstats || rate_est != e->rate_est)
- p = p->rb_left;
- else
- return e;
+ if (unlikely(time_after_eq(jiffies, est->next_jiffies))) {
+ /* Ouch... timer was delayed. */
+ est->next_jiffies = jiffies + 1;
}
- return NULL;
+ mod_timer(&est->timer, est->next_jiffies);
}
/**
@@ -211,83 +126,76 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
*/
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **rate_est,
spinlock_t *stats_lock,
seqcount_t *running,
struct nlattr *opt)
{
- struct gen_estimator *est;
struct gnet_estimator *parm = nla_data(opt);
- struct gnet_stats_basic_packed b = {0};
- int idx;
+ struct net_rate_estimator *old, *est;
+ struct gnet_stats_basic_packed b;
+ int intvl_log;
if (nla_len(opt) < sizeof(*parm))
return -EINVAL;
+ /* allowed timer periods are :
+ * -2 : 250ms, -1 : 500ms, 0 : 1 sec
+ * 1 : 2 sec, 2 : 4 sec, 3 : 8 sec
+ */
if (parm->interval < -2 || parm->interval > 3)
return -EINVAL;
est = kzalloc(sizeof(*est), GFP_KERNEL);
- if (est == NULL)
+ if (!est)
return -ENOBUFS;
- __gnet_stats_copy_basic(running, &b, cpu_bstats, bstats);
-
- idx = parm->interval + 2;
+ seqcount_init(&est->seq);
+ intvl_log = parm->interval + 2;
est->bstats = bstats;
- est->rate_est = rate_est;
est->stats_lock = stats_lock;
est->running = running;
est->ewma_log = parm->ewma_log;
- est->last_bytes = b.bytes;
- est->avbps = rate_est->bps<<5;
- est->last_packets = b.packets;
- est->avpps = rate_est->pps<<10;
+ est->intvl_log = intvl_log;
est->cpu_bstats = cpu_bstats;
- spin_lock_bh(&est_tree_lock);
- if (!elist[idx].timer.function) {
- INIT_LIST_HEAD(&elist[idx].list);
- setup_timer(&elist[idx].timer, est_timer, idx);
+ est_fetch_counters(est, &b);
+ est->last_bytes = b.bytes;
+ est->last_packets = b.packets;
+ old = rcu_dereference_protected(*rate_est, 1);
+ if (old) {
+ del_timer_sync(&old->timer);
+ est->avbps = old->avbps;
+ est->avpps = old->avpps;
}
- if (list_empty(&elist[idx].list))
- mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
-
- list_add_rcu(&est->list, &elist[idx].list);
- gen_add_node(est);
- spin_unlock_bh(&est_tree_lock);
+ est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
+ setup_timer(&est->timer, est_timer, (unsigned long)est);
+ mod_timer(&est->timer, est->next_jiffies);
+ rcu_assign_pointer(*rate_est, est);
+ if (old)
+ kfree_rcu(old, rcu);
return 0;
}
EXPORT_SYMBOL(gen_new_estimator);
/**
* gen_kill_estimator - remove a rate estimator
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
*
- * Removes the rate estimator specified by &bstats and &rate_est.
+ * Removes the rate estimator.
*
- * Note : Caller should respect an RCU grace period before freeing stats_lock
*/
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_rate_est64 *rate_est)
+void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
{
- struct gen_estimator *e;
-
- spin_lock_bh(&est_tree_lock);
- while ((e = gen_find_node(bstats, rate_est))) {
- rb_erase(&e->node, &est_root);
+ struct net_rate_estimator *est;
- write_lock(&est_lock);
- e->bstats = NULL;
- write_unlock(&est_lock);
-
- list_del_rcu(&e->list);
- kfree_rcu(e, e_rcu);
+ est = xchg((__force struct net_rate_estimator **)rate_est, NULL);
+ if (est) {
+ del_timer_sync(&est->timer);
+ kfree_rcu(est, rcu);
}
- spin_unlock_bh(&est_tree_lock);
}
EXPORT_SYMBOL(gen_kill_estimator);
@@ -307,33 +215,47 @@ EXPORT_SYMBOL(gen_kill_estimator);
*/
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **rate_est,
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt)
{
- gen_kill_estimator(bstats, rate_est);
- return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
+ return gen_new_estimator(bstats, cpu_bstats, rate_est,
+ stats_lock, running, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);
/**
* gen_estimator_active - test if estimator is currently in use
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
*
* Returns true if estimator is active, and false if not.
*/
-bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est64 *rate_est)
+bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est)
{
- bool res;
+ return !!rcu_access_pointer(*rate_est);
+}
+EXPORT_SYMBOL(gen_estimator_active);
- ASSERT_RTNL();
+bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est,
+ struct gnet_stats_rate_est64 *sample)
+{
+ struct net_rate_estimator *est;
+ unsigned seq;
+
+ rcu_read_lock();
+ est = rcu_dereference(*rate_est);
+ if (!est) {
+ rcu_read_unlock();
+ return false;
+ }
- spin_lock_bh(&est_tree_lock);
- res = gen_find_node(bstats, rate_est) != NULL;
- spin_unlock_bh(&est_tree_lock);
+ do {
+ seq = read_seqcount_begin(&est->seq);
+ sample->bps = est->avbps >> 8;
+ sample->pps = est->avpps >> 8;
+ } while (read_seqcount_retry(&est->seq, seq));
- return res;
+ rcu_read_unlock();
+ return true;
}
-EXPORT_SYMBOL(gen_estimator_active);
+EXPORT_SYMBOL(gen_estimator_read);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 508e051304fb..87f28557b329 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -194,8 +194,7 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
/**
* gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
* @d: dumping handle
- * @b: basic statistics
- * @r: rate estimator statistics
+ * @rate_est: rate estimator
*
* Appends the rate estimator statistics to the top level TLV created by
* gnet_stats_start_copy().
@@ -205,18 +204,17 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
*/
int
gnet_stats_copy_rate_est(struct gnet_dump *d,
- const struct gnet_stats_basic_packed *b,
- struct gnet_stats_rate_est64 *r)
+ struct net_rate_estimator __rcu **rate_est)
{
+ struct gnet_stats_rate_est64 sample;
struct gnet_stats_rate_est est;
int res;
- if (b && !gen_estimator_active(b, r))
+ if (!gen_estimator_read(rate_est, &sample))
return 0;
-
- est.bps = min_t(u64, UINT_MAX, r->bps);
+ est.bps = min_t(u64, UINT_MAX, sample.bps);
/* we have some time before reaching 2^32 packets per second */
- est.pps = r->pps;
+ est.pps = sample.pps;
if (d->compat_tc_stats) {
d->tc_stats.bps = est.bps;
@@ -226,11 +224,11 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
if (d->tail) {
res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est),
TCA_STATS_PAD);
- if (res < 0 || est.bps == r->bps)
+ if (res < 0 || est.bps == sample.bps)
return res;
/* emit 64bit stats only if needed */
- return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r),
- TCA_STATS_PAD);
+ return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample,
+ sizeof(sample), TCA_STATS_PAD);
}
return 0;
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
new file mode 100644
index 000000000000..c98bbfbd26b8
--- /dev/null
+++ b/net/core/gro_cells.c
@@ -0,0 +1,92 @@
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <net/gro_cells.h>
+
+struct gro_cell {
+ struct sk_buff_head napi_skbs;
+ struct napi_struct napi;
+};
+
+int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct gro_cell *cell;
+
+ if (!gcells->cells || skb_cloned(skb) || !(dev->features & NETIF_F_GRO))
+ return netif_rx(skb);
+
+ cell = this_cpu_ptr(gcells->cells);
+
+ if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) {
+ atomic_long_inc(&dev->rx_dropped);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ __skb_queue_tail(&cell->napi_skbs, skb);
+ if (skb_queue_len(&cell->napi_skbs) == 1)
+ napi_schedule(&cell->napi);
+ return NET_RX_SUCCESS;
+}
+EXPORT_SYMBOL(gro_cells_receive);
+
+/* called under BH context */
+static int gro_cell_poll(struct napi_struct *napi, int budget)
+{
+ struct gro_cell *cell = container_of(napi, struct gro_cell, napi);
+ struct sk_buff *skb;
+ int work_done = 0;
+
+ while (work_done < budget) {
+ skb = __skb_dequeue(&cell->napi_skbs);
+ if (!skb)
+ break;
+ napi_gro_receive(napi, skb);
+ work_done++;
+ }
+
+ if (work_done < budget)
+ napi_complete_done(napi, work_done);
+ return work_done;
+}
+
+int gro_cells_init(struct gro_cells *gcells, struct net_device *dev)
+{
+ int i;
+
+ gcells->cells = alloc_percpu(struct gro_cell);
+ if (!gcells->cells)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
+
+ __skb_queue_head_init(&cell->napi_skbs);
+
+ set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state);
+
+ netif_napi_add(dev, &cell->napi, gro_cell_poll,
+ NAPI_POLL_WEIGHT);
+ napi_enable(&cell->napi);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(gro_cells_init);
+
+void gro_cells_destroy(struct gro_cells *gcells)
+{
+ int i;
+
+ if (!gcells->cells)
+ return;
+ for_each_possible_cpu(i) {
+ struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
+
+ netif_napi_del(&cell->napi);
+ __skb_queue_purge(&cell->napi_skbs);
+ }
+ free_percpu(gcells->cells);
+ gcells->cells = NULL;
+}
+EXPORT_SYMBOL(gro_cells_destroy);
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
new file mode 100644
index 000000000000..0cfe7b0216c3
--- /dev/null
+++ b/net/core/lwt_bpf.c
@@ -0,0 +1,397 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <net/lwtunnel.h>
+
+struct bpf_lwt_prog {
+ struct bpf_prog *prog;
+ char *name;
+};
+
+struct bpf_lwt {
+ struct bpf_lwt_prog in;
+ struct bpf_lwt_prog out;
+ struct bpf_lwt_prog xmit;
+ int family;
+};
+
+#define MAX_PROG_NAME 256
+
+static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return (struct bpf_lwt *)lwt->data;
+}
+
+#define NO_REDIRECT false
+#define CAN_REDIRECT true
+
+static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
+ struct dst_entry *dst, bool can_redirect)
+{
+ int ret;
+
+ /* Preempt disable is needed to protect per-cpu redirect_info between
+ * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
+ * access to maps strictly require a rcu_read_lock() for protection,
+ * mixing with BH RCU lock doesn't work.
+ */
+ preempt_disable();
+ rcu_read_lock();
+ bpf_compute_data_end(skb);
+ ret = bpf_prog_run_save_cb(lwt->prog, skb);
+ rcu_read_unlock();
+
+ switch (ret) {
+ case BPF_OK:
+ break;
+
+ case BPF_REDIRECT:
+ if (unlikely(!can_redirect)) {
+ pr_warn_once("Illegal redirect return code in prog %s\n",
+ lwt->name ? : "<unknown>");
+ ret = BPF_OK;
+ } else {
+ ret = skb_do_redirect(skb);
+ if (ret == 0)
+ ret = BPF_REDIRECT;
+ }
+ break;
+
+ case BPF_DROP:
+ kfree_skb(skb);
+ ret = -EPERM;
+ break;
+
+ default:
+ pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
+ kfree_skb(skb);
+ ret = -EINVAL;
+ break;
+ }
+
+ preempt_enable();
+
+ return ret;
+}
+
+static int bpf_input(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+ int ret;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->in.prog) {
+ ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (unlikely(!dst->lwtstate->orig_input)) {
+ pr_warn_once("orig_input not set on dst for prog %s\n",
+ bpf->out.name);
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return dst->lwtstate->orig_input(skb);
+}
+
+static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+ int ret;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->out.prog) {
+ ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (unlikely(!dst->lwtstate->orig_output)) {
+ pr_warn_once("orig_output not set on dst for prog %s\n",
+ bpf->out.name);
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return dst->lwtstate->orig_output(net, sk, skb);
+}
+
+static int xmit_check_hhlen(struct sk_buff *skb)
+{
+ int hh_len = skb_dst(skb)->dev->hard_header_len;
+
+ if (skb_headroom(skb) < hh_len) {
+ int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
+
+ if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int bpf_xmit(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->xmit.prog) {
+ int ret;
+
+ ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
+ switch (ret) {
+ case BPF_OK:
+ /* If the header was expanded, headroom might be too
+ * small for L2 header to come, expand as needed.
+ */
+ ret = xmit_check_hhlen(skb);
+ if (unlikely(ret))
+ return ret;
+
+ return LWTUNNEL_XMIT_CONTINUE;
+ case BPF_REDIRECT:
+ return LWTUNNEL_XMIT_DONE;
+ default:
+ return ret;
+ }
+ }
+
+ return LWTUNNEL_XMIT_CONTINUE;
+}
+
+static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
+{
+ if (prog->prog)
+ bpf_prog_put(prog->prog);
+
+ kfree(prog->name);
+}
+
+static void bpf_destroy_state(struct lwtunnel_state *lwt)
+{
+ struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+ bpf_lwt_prog_destroy(&bpf->in);
+ bpf_lwt_prog_destroy(&bpf->out);
+ bpf_lwt_prog_destroy(&bpf->xmit);
+}
+
+static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
+ [LWT_BPF_PROG_FD] = { .type = NLA_U32, },
+ [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
+ .len = MAX_PROG_NAME },
+};
+
+static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
+ enum bpf_prog_type type)
+{
+ struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
+ struct bpf_prog *p;
+ int ret;
+ u32 fd;
+
+ ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
+ return -EINVAL;
+
+ prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL);
+ if (!prog->name)
+ return -ENOMEM;
+
+ fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
+ p = bpf_prog_get_type(fd, type);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ prog->prog = p;
+
+ return 0;
+}
+
+static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
+ [LWT_BPF_IN] = { .type = NLA_NESTED, },
+ [LWT_BPF_OUT] = { .type = NLA_NESTED, },
+ [LWT_BPF_XMIT] = { .type = NLA_NESTED, },
+ [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
+};
+
+static int bpf_build_state(struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts)
+{
+ struct nlattr *tb[LWT_BPF_MAX + 1];
+ struct lwtunnel_state *newts;
+ struct bpf_lwt *bpf;
+ int ret;
+
+ if (family != AF_INET && family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
+ return -EINVAL;
+
+ newts = lwtunnel_state_alloc(sizeof(*bpf));
+ if (!newts)
+ return -ENOMEM;
+
+ newts->type = LWTUNNEL_ENCAP_BPF;
+ bpf = bpf_lwt_lwtunnel(newts);
+
+ if (tb[LWT_BPF_IN]) {
+ newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
+ BPF_PROG_TYPE_LWT_IN);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_OUT]) {
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
+ BPF_PROG_TYPE_LWT_OUT);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_XMIT]) {
+ newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
+ BPF_PROG_TYPE_LWT_XMIT);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_XMIT_HEADROOM]) {
+ u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);
+
+ if (headroom > LWT_BPF_MAX_HEADROOM) {
+ ret = -ERANGE;
+ goto errout;
+ }
+
+ newts->headroom = headroom;
+ }
+
+ bpf->family = family;
+ *ts = newts;
+
+ return 0;
+
+errout:
+ bpf_destroy_state(newts);
+ kfree(newts);
+ return ret;
+}
+
+static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
+ struct bpf_lwt_prog *prog)
+{
+ struct nlattr *nest;
+
+ if (!prog->prog)
+ return 0;
+
+ nest = nla_nest_start(skb, attr);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (prog->name &&
+ nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
+ return -EMSGSIZE;
+
+ return nla_nest_end(skb, nest);
+}
+
+static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
+{
+ struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+ if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
+ bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
+ bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ int nest_len = nla_total_size(sizeof(struct nlattr)) +
+ nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
+ 0;
+
+ return nest_len + /* LWT_BPF_IN */
+ nest_len + /* LWT_BPF_OUT */
+ nest_len + /* LWT_BPF_XMIT */
+ 0;
+}
+
+static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
+{
+ /* FIXME:
+ * The LWT state is currently rebuilt for delete requests which
+ * results in a new bpf_prog instance. Comparing names for now.
+ */
+ if (!a->name && !b->name)
+ return 0;
+
+ if (!a->name || !b->name)
+ return 1;
+
+ return strcmp(a->name, b->name);
+}
+
+static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
+ struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);
+
+ return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
+ bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
+ bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
+}
+
+static const struct lwtunnel_encap_ops bpf_encap_ops = {
+ .build_state = bpf_build_state,
+ .destroy_state = bpf_destroy_state,
+ .input = bpf_input,
+ .output = bpf_output,
+ .xmit = bpf_xmit,
+ .fill_encap = bpf_fill_encap_info,
+ .get_encap_size = bpf_encap_nlsize,
+ .cmp_encap = bpf_encap_cmp,
+ .owner = THIS_MODULE,
+};
+
+static int __init bpf_lwt_init(void)
+{
+ return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
+}
+
+subsys_initcall(bpf_lwt_init)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index e5f84c26ba1a..6df9f8fabf0c 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -26,6 +26,7 @@
#include <net/lwtunnel.h>
#include <net/rtnetlink.h>
#include <net/ip6_fib.h>
+#include <net/nexthop.h>
#ifdef CONFIG_MODULES
@@ -39,6 +40,10 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
return "MPLS";
case LWTUNNEL_ENCAP_ILA:
return "ILA";
+ case LWTUNNEL_ENCAP_SEG6:
+ return "SEG6";
+ case LWTUNNEL_ENCAP_BPF:
+ return "BPF";
case LWTUNNEL_ENCAP_IP6:
case LWTUNNEL_ENCAP_IP:
case LWTUNNEL_ENCAP_NONE:
@@ -96,7 +101,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
}
EXPORT_SYMBOL(lwtunnel_encap_del_ops);
-int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+int lwtunnel_build_state(u16 encap_type,
struct nlattr *encap, unsigned int family,
const void *cfg, struct lwtunnel_state **lws)
{
@@ -110,25 +115,91 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[encap_type]);
+ if (likely(ops && ops->build_state && try_module_get(ops->owner))) {
+ ret = ops->build_state(encap, family, cfg, lws);
+ if (ret)
+ module_put(ops->owner);
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_build_state);
+
+int lwtunnel_valid_encap_type(u16 encap_type)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = -EINVAL;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE ||
+ encap_type > LWTUNNEL_ENCAP_MAX)
+ return ret;
+
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[encap_type]);
+ rcu_read_unlock();
#ifdef CONFIG_MODULES
if (!ops) {
const char *encap_type_str = lwtunnel_encap_str(encap_type);
if (encap_type_str) {
- rcu_read_unlock();
+ __rtnl_unlock();
request_module("rtnl-lwt-%s", encap_type_str);
+ rtnl_lock();
+
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[encap_type]);
+ rcu_read_unlock();
}
}
#endif
- if (likely(ops && ops->build_state))
- ret = ops->build_state(dev, encap, family, cfg, lws);
- rcu_read_unlock();
+ return ops ? 0 : -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(lwtunnel_valid_encap_type);
- return ret;
+int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining)
+{
+ struct rtnexthop *rtnh = (struct rtnexthop *)attr;
+ struct nlattr *nla_entype;
+ struct nlattr *attrs;
+ struct nlattr *nla;
+ u16 encap_type;
+ int attrlen;
+
+ while (rtnh_ok(rtnh, remaining)) {
+ attrlen = rtnh_attrlen(rtnh);
+ if (attrlen > 0) {
+ attrs = rtnh_attrs(rtnh);
+ nla = nla_find(attrs, attrlen, RTA_ENCAP);
+ nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+
+ if (nla_entype) {
+ encap_type = nla_get_u16(nla_entype);
+
+ if (lwtunnel_valid_encap_type(encap_type) != 0)
+ return -EOPNOTSUPP;
+ }
+ }
+ rtnh = rtnh_next(rtnh, &remaining);
+ }
+
+ return 0;
}
-EXPORT_SYMBOL(lwtunnel_build_state);
+EXPORT_SYMBOL(lwtunnel_valid_encap_type_attr);
+
+void lwtstate_free(struct lwtunnel_state *lws)
+{
+ const struct lwtunnel_encap_ops *ops = lwtun_encaps[lws->type];
+
+ if (ops->destroy_state) {
+ ops->destroy_state(lws);
+ kfree_rcu(lws, rcu);
+ } else {
+ kfree(lws);
+ }
+ module_put(ops->owner);
+}
+EXPORT_SYMBOL(lwtstate_free);
int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
{
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 2ae929f9bd06..4526cbd7e28a 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -100,6 +100,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)
neigh->parms->neigh_cleanup(neigh);
__neigh_notify(neigh, RTM_DELNEIGH, 0);
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
neigh_release(neigh);
}
@@ -859,7 +860,8 @@ static void neigh_probe(struct neighbour *neigh)
if (skb)
skb = skb_clone(skb, GFP_ATOMIC);
write_unlock(&neigh->lock);
- neigh->ops->solicit(neigh, skb);
+ if (neigh->ops->solicit)
+ neigh->ops->solicit(neigh, skb);
atomic_inc(&neigh->probes);
kfree_skb(skb);
}
@@ -2291,13 +2293,10 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
n != NULL;
n = rcu_dereference_bh(n->next)) {
- if (!net_eq(dev_net(n->dev), net))
- continue;
- if (neigh_ifindex_filtered(n->dev, filter_idx))
- continue;
- if (neigh_master_filtered(n->dev, filter_master_idx))
- continue;
- if (idx < s_idx)
+ if (idx < s_idx || !net_eq(dev_net(n->dev), net))
+ goto next;
+ if (neigh_ifindex_filtered(n->dev, filter_idx) ||
+ neigh_master_filtered(n->dev, filter_master_idx))
goto next;
if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
@@ -2332,9 +2331,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
if (h > s_h)
s_idx = 0;
for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
- if (pneigh_net(n) != net)
- continue;
- if (idx < s_idx)
+ if (idx < s_idx || pneigh_net(n) != net)
goto next;
if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
@@ -2927,7 +2924,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write)
return;
set_bit(index, p->data_state);
- call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
+ if (index == NEIGH_VAR_DELAY_PROBE_TIME)
+ call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
if (!dev) /* NULL dev means this is default value */
neigh_copy_dflt_parms(net, p, index);
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 6e4f34721080..65ea0ff4017c 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -15,6 +15,7 @@
#include <net/switchdev.h>
#include <linux/if_arp.h>
#include <linux/slab.h>
+#include <linux/sched/signal.h>
#include <linux/nsproxy.h>
#include <net/sock.h>
#include <net/net_namespace.h>
@@ -950,10 +951,13 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
}
while (--i >= new_num) {
+ struct kobject *kobj = &dev->_rx[i].kobj;
+
+ if (!atomic_read(&dev_net(dev)->count))
+ kobj->uevent_suppress = 1;
if (dev->sysfs_rx_queue_group)
- sysfs_remove_group(&dev->_rx[i].kobj,
- dev->sysfs_rx_queue_group);
- kobject_put(&dev->_rx[i].kobj);
+ sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
+ kobject_put(kobj);
}
return error;
@@ -1021,7 +1025,6 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue,
return sprintf(buf, "%lu", trans_timeout);
}
-#ifdef CONFIG_XPS
static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
{
struct net_device *dev = queue->dev;
@@ -1033,6 +1036,21 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
return i;
}
+static ssize_t show_traffic_class(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ char *buf)
+{
+ struct net_device *dev = queue->dev;
+ int index = get_netdev_queue_index(queue);
+ int tc = netdev_txq_to_tc(dev, index);
+
+ if (tc < 0)
+ return -EINVAL;
+
+ return sprintf(buf, "%u\n", tc);
+}
+
+#ifdef CONFIG_XPS
static ssize_t show_tx_maxrate(struct netdev_queue *queue,
struct netdev_queue_attribute *attribute,
char *buf)
@@ -1075,6 +1093,9 @@ static struct netdev_queue_attribute queue_tx_maxrate =
static struct netdev_queue_attribute queue_trans_timeout =
__ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
+static struct netdev_queue_attribute queue_traffic_class =
+ __ATTR(traffic_class, S_IRUGO, show_traffic_class, NULL);
+
#ifdef CONFIG_BQL
/*
* Byte queue limits sysfs structures and functions.
@@ -1190,29 +1211,38 @@ static ssize_t show_xps_map(struct netdev_queue *queue,
struct netdev_queue_attribute *attribute, char *buf)
{
struct net_device *dev = queue->dev;
+ int cpu, len, num_tc = 1, tc = 0;
struct xps_dev_maps *dev_maps;
cpumask_var_t mask;
unsigned long index;
- int i, len;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
index = get_netdev_queue_index(queue);
+ if (dev->num_tc) {
+ num_tc = dev->num_tc;
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0)
+ return -EINVAL;
+ }
+
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_maps);
if (dev_maps) {
- for_each_possible_cpu(i) {
- struct xps_map *map =
- rcu_dereference(dev_maps->cpu_map[i]);
- if (map) {
- int j;
- for (j = 0; j < map->len; j++) {
- if (map->queues[j] == index) {
- cpumask_set_cpu(i, mask);
- break;
- }
+ for_each_possible_cpu(cpu) {
+ int i, tci = cpu * num_tc + tc;
+ struct xps_map *map;
+
+ map = rcu_dereference(dev_maps->cpu_map[tci]);
+ if (!map)
+ continue;
+
+ for (i = map->len; i--;) {
+ if (map->queues[i] == index) {
+ cpumask_set_cpu(cpu, mask);
+ break;
}
}
}
@@ -1260,6 +1290,7 @@ static struct netdev_queue_attribute xps_cpus_attribute =
static struct attribute *netdev_queue_default_attrs[] = {
&queue_trans_timeout.attr,
+ &queue_traffic_class.attr,
#ifdef CONFIG_XPS
&xps_cpus_attribute.attr,
&queue_tx_maxrate.attr,
@@ -1340,6 +1371,8 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
while (--i >= new_num) {
struct netdev_queue *queue = dev->_tx + i;
+ if (!atomic_read(&dev_net(dev)->count))
+ queue->kobj.uevent_suppress = 1;
#ifdef CONFIG_BQL
sysfs_remove_group(&queue->kobj, &dql_group);
#endif
@@ -1525,6 +1558,9 @@ void netdev_unregister_kobject(struct net_device *ndev)
{
struct device *dev = &(ndev->dev);
+ if (!atomic_read(&dev_net(ndev)->count))
+ dev_set_uevent_suppress(dev, 1);
+
kobject_get(&dev->kobj);
remove_queue_kobjects(ndev);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 7001da910c6b..652468ff65b7 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -16,6 +16,8 @@
#include <linux/export.h>
#include <linux/user_namespace.h>
#include <linux/net_namespace.h>
+#include <linux/sched/task.h>
+
#include <net/sock.h>
#include <net/netlink.h>
#include <net/net_namespace.h>
@@ -39,6 +41,9 @@ EXPORT_SYMBOL(init_net);
static bool init_net_initialized;
+#define MIN_PERNET_OPS_ID \
+ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
+
#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
@@ -46,27 +51,28 @@ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
static struct net_generic *net_alloc_generic(void)
{
struct net_generic *ng;
- size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
+ unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
ng = kzalloc(generic_size, GFP_KERNEL);
if (ng)
- ng->len = max_gen_ptrs;
+ ng->s.len = max_gen_ptrs;
return ng;
}
-static int net_assign_generic(struct net *net, int id, void *data)
+static int net_assign_generic(struct net *net, unsigned int id, void *data)
{
struct net_generic *ng, *old_ng;
BUG_ON(!mutex_is_locked(&net_mutex));
- BUG_ON(id == 0);
+ BUG_ON(id < MIN_PERNET_OPS_ID);
old_ng = rcu_dereference_protected(net->gen,
lockdep_is_held(&net_mutex));
- ng = old_ng;
- if (old_ng->len >= id)
- goto assign;
+ if (old_ng->s.len > id) {
+ old_ng->ptr[id] = data;
+ return 0;
+ }
ng = net_alloc_generic();
if (ng == NULL)
@@ -83,12 +89,12 @@ static int net_assign_generic(struct net *net, int id, void *data)
* the old copy for kfree after a grace period.
*/
- memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
+ memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID],
+ (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *));
+ ng->ptr[id] = data;
rcu_assign_pointer(net->gen, ng);
- kfree_rcu(old_ng, rcu);
-assign:
- ng->ptr[id - 1] = data;
+ kfree_rcu(old_ng, s.rcu);
return 0;
}
@@ -122,8 +128,7 @@ out:
static void ops_free(const struct pernet_operations *ops, struct net *net)
{
if (ops->id && ops->size) {
- int id = *ops->id;
- kfree(net_generic(net, id));
+ kfree(net_generic(net, *ops->id));
}
}
@@ -215,16 +220,15 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id);
*/
int peernet2id_alloc(struct net *net, struct net *peer)
{
- unsigned long flags;
bool alloc;
int id;
if (atomic_read(&net->count) == 0)
return NETNSA_NSID_NOT_ASSIGNED;
- spin_lock_irqsave(&net->nsid_lock, flags);
+ spin_lock_bh(&net->nsid_lock);
alloc = atomic_read(&peer->count) == 0 ? false : true;
id = __peernet2id_alloc(net, peer, &alloc);
- spin_unlock_irqrestore(&net->nsid_lock, flags);
+ spin_unlock_bh(&net->nsid_lock);
if (alloc && id >= 0)
rtnl_net_notifyid(net, RTM_NEWNSID, id);
return id;
@@ -233,12 +237,11 @@ int peernet2id_alloc(struct net *net, struct net *peer)
/* This function returns, if assigned, the id of a peer netns. */
int peernet2id(struct net *net, struct net *peer)
{
- unsigned long flags;
int id;
- spin_lock_irqsave(&net->nsid_lock, flags);
+ spin_lock_bh(&net->nsid_lock);
id = __peernet2id(net, peer);
- spin_unlock_irqrestore(&net->nsid_lock, flags);
+ spin_unlock_bh(&net->nsid_lock);
return id;
}
EXPORT_SYMBOL(peernet2id);
@@ -253,18 +256,17 @@ bool peernet_has_id(struct net *net, struct net *peer)
struct net *get_net_ns_by_id(struct net *net, int id)
{
- unsigned long flags;
struct net *peer;
if (id < 0)
return NULL;
rcu_read_lock();
- spin_lock_irqsave(&net->nsid_lock, flags);
+ spin_lock_bh(&net->nsid_lock);
peer = idr_find(&net->netns_ids, id);
if (peer)
get_net(peer);
- spin_unlock_irqrestore(&net->nsid_lock, flags);
+ spin_unlock_bh(&net->nsid_lock);
rcu_read_unlock();
return peer;
@@ -384,7 +386,14 @@ struct net *copy_net_ns(unsigned long flags,
get_user_ns(user_ns);
- mutex_lock(&net_mutex);
+ rv = mutex_lock_killable(&net_mutex);
+ if (rv < 0) {
+ net_free(net);
+ dec_net_namespaces(ucounts);
+ put_user_ns(user_ns);
+ return ERR_PTR(rv);
+ }
+
net->ucounts = ucounts;
rv = setup_net(net, user_ns);
if (rv == 0) {
@@ -427,17 +436,17 @@ static void cleanup_net(struct work_struct *work)
for_each_net(tmp) {
int id;
- spin_lock_irq(&tmp->nsid_lock);
+ spin_lock_bh(&tmp->nsid_lock);
id = __peernet2id(tmp, net);
if (id >= 0)
idr_remove(&tmp->netns_ids, id);
- spin_unlock_irq(&tmp->nsid_lock);
+ spin_unlock_bh(&tmp->nsid_lock);
if (id >= 0)
rtnl_net_notifyid(tmp, RTM_DELNSID, id);
}
- spin_lock_irq(&net->nsid_lock);
+ spin_lock_bh(&net->nsid_lock);
idr_destroy(&net->netns_ids);
- spin_unlock_irq(&net->nsid_lock);
+ spin_unlock_bh(&net->nsid_lock);
}
rtnl_unlock();
@@ -566,7 +575,6 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[NETNSA_MAX + 1];
- unsigned long flags;
struct net *peer;
int nsid, err;
@@ -587,15 +595,15 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
if (IS_ERR(peer))
return PTR_ERR(peer);
- spin_lock_irqsave(&net->nsid_lock, flags);
+ spin_lock_bh(&net->nsid_lock);
if (__peernet2id(net, peer) >= 0) {
- spin_unlock_irqrestore(&net->nsid_lock, flags);
+ spin_unlock_bh(&net->nsid_lock);
err = -EEXIST;
goto out;
}
err = alloc_netid(net, peer, nsid);
- spin_unlock_irqrestore(&net->nsid_lock, flags);
+ spin_unlock_bh(&net->nsid_lock);
if (err >= 0) {
rtnl_net_notifyid(net, RTM_NEWNSID, err);
err = 0;
@@ -717,11 +725,10 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
.idx = 0,
.s_idx = cb->args[0],
};
- unsigned long flags;
- spin_lock_irqsave(&net->nsid_lock, flags);
+ spin_lock_bh(&net->nsid_lock);
idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
- spin_unlock_irqrestore(&net->nsid_lock, flags);
+ spin_unlock_bh(&net->nsid_lock);
cb->args[0] = net_cb.idx;
return skb->len;
@@ -868,7 +875,7 @@ static int register_pernet_operations(struct list_head *list,
if (ops->id) {
again:
- error = ida_get_new_above(&net_generic_ids, 1, ops->id);
+ error = ida_get_new_above(&net_generic_ids, MIN_PERNET_OPS_ID, ops->id);
if (error < 0) {
if (error == -EAGAIN) {
ida_pre_get(&net_generic_ids, GFP_KERNEL);
@@ -876,7 +883,7 @@ again:
}
return error;
}
- max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id);
+ max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1);
}
error = __register_pernet_operations(list, ops);
if (error) {
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 11fce17274f6..029a61ac6cdd 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -12,6 +12,8 @@
#include <linux/slab.h>
#include <linux/cgroup.h>
#include <linux/fdtable.h>
+#include <linux/sched/task.h>
+
#include <net/cls_cgroup.h>
#include <net/sock.h>
@@ -69,27 +71,17 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
return 0;
}
-static void update_classid(struct cgroup_subsys_state *css, void *v)
+static void cgrp_attach(struct cgroup_taskset *tset)
{
- struct css_task_iter it;
+ struct cgroup_subsys_state *css;
struct task_struct *p;
- css_task_iter_start(css, &it);
- while ((p = css_task_iter_next(&it))) {
+ cgroup_taskset_for_each(p, css, tset) {
task_lock(p);
- iterate_fd(p->files, 0, update_classid_sock, v);
+ iterate_fd(p->files, 0, update_classid_sock,
+ (void *)(unsigned long)css_cls_state(css)->classid);
task_unlock(p);
}
- css_task_iter_end(&it);
-}
-
-static void cgrp_attach(struct cgroup_taskset *tset)
-{
- struct cgroup_subsys_state *css;
-
- cgroup_taskset_first(tset, &css);
- update_classid(css,
- (void *)(unsigned long)css_cls_state(css)->classid);
}
static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
@@ -101,12 +93,22 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
u64 value)
{
struct cgroup_cls_state *cs = css_cls_state(css);
+ struct css_task_iter it;
+ struct task_struct *p;
cgroup_sk_alloc_disable();
cs->classid = (u32)value;
- update_classid(css, (void *)(unsigned long)cs->classid);
+ css_task_iter_start(css, &it);
+ while ((p = css_task_iter_next(&it))) {
+ task_lock(p);
+ iterate_fd(p->files, 0, update_classid_sock,
+ (void *)(unsigned long)cs->classid);
+ task_unlock(p);
+ }
+ css_task_iter_end(&it);
+
return 0;
}
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 53599bd0c82d..29be2466970c 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -105,15 +105,21 @@ static void queue_process(struct work_struct *work)
while ((skb = skb_dequeue(&npinfo->txq))) {
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
+ unsigned int q_index;
if (!netif_device_present(dev) || !netif_running(dev)) {
kfree_skb(skb);
continue;
}
- txq = skb_get_tx_queue(dev, skb);
-
local_irq_save(flags);
+ /* check if skb->queue_mapping is still valid */
+ q_index = skb_get_queue_mapping(skb);
+ if (unlikely(q_index >= dev->real_num_tx_queues)) {
+ q_index = q_index % dev->real_num_tx_queues;
+ skb_set_queue_mapping(skb, q_index);
+ }
+ txq = netdev_get_tx_queue(dev, q_index);
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (netif_xmit_frozen_or_stopped(txq) ||
netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {
@@ -171,12 +177,12 @@ static void poll_one_napi(struct napi_struct *napi)
static void poll_napi(struct net_device *dev)
{
struct napi_struct *napi;
+ int cpu = smp_processor_id();
list_for_each_entry(napi, &dev->napi_list, dev_list) {
- if (napi->poll_owner != smp_processor_id() &&
- spin_trylock(&napi->poll_lock)) {
+ if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) {
poll_one_napi(napi);
- spin_unlock(&napi->poll_lock);
+ smp_store_release(&napi->poll_owner, -1);
}
}
}
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 2ec86fc552df..0f9275ee5595 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -13,12 +13,15 @@
#include <linux/slab.h>
#include <linux/types.h>
+#include <linux/module.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/atomic.h>
+#include <linux/sched/task.h>
+
#include <net/rtnetlink.h>
#include <net/pkt_cls.h>
#include <net/sock.h>
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 306b8f0e03c1..96947f5d41e4 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -413,7 +413,7 @@ struct pktgen_hdr {
};
-static int pg_net_id __read_mostly;
+static unsigned int pg_net_id __read_mostly;
struct pktgen_net {
struct net *net;
@@ -3439,9 +3439,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
/* skb was 'freed' by stack, so clean few
* bits and reuse it
*/
-#ifdef CONFIG_NET_CLS_ACT
- skb->tc_verd = 0; /* reset reclass/redir ttl */
-#endif
+ skb_reset_tc(skb);
} while (--burst > 0);
goto out; /* Skips xmit_mode M_START_XMIT */
} else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 5d26056b6d8f..9b8727c67b58 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -34,8 +34,6 @@
* and it will increase in proportion to the memory of machine.
* Note : Dont forget somaxconn that may limit backlog too.
*/
-int sysctl_max_syn_backlog = 256;
-EXPORT_SYMBOL(sysctl_max_syn_backlog);
void reqsk_queue_alloc(struct request_sock_queue *queue)
{
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a6196cf844f6..c4e84c558240 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -40,7 +40,7 @@
#include <linux/pci.h>
#include <linux/etherdevice.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
@@ -837,8 +837,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
static inline int rtnl_vfinfo_size(const struct net_device *dev,
u32 ext_filter_mask)
{
- if (dev->dev.parent && dev_is_pci(dev->dev.parent) &&
- (ext_filter_mask & RTEXT_FILTER_VF)) {
+ if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) {
int num_vfs = dev_num_vf(dev->dev.parent);
size_t size = nla_total_size(0);
size += num_vfs *
@@ -877,8 +876,6 @@ static size_t rtnl_port_size(const struct net_device *dev,
{
size_t port_size = nla_total_size(4) /* PORT_VF */
+ nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */
- + nla_total_size(sizeof(struct ifla_port_vsi))
- /* PORT_VSI_TYPE */
+ nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */
+ nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */
+ nla_total_size(1) /* PROT_VDP_REQUEST */
@@ -1492,19 +1489,25 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
[IFLA_PORT_VF] = { .type = NLA_U32 },
[IFLA_PORT_PROFILE] = { .type = NLA_STRING,
.len = PORT_PROFILE_MAX },
- [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY,
- .len = sizeof(struct ifla_port_vsi)},
[IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY,
.len = PORT_UUID_MAX },
[IFLA_PORT_HOST_UUID] = { .type = NLA_STRING,
.len = PORT_UUID_MAX },
[IFLA_PORT_REQUEST] = { .type = NLA_U8, },
[IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
+
+ /* Unused, but we need to keep it here since user space could
+ * fill it. It's also broken with regard to NLA_BINARY use in
+ * combination with structs.
+ */
+ [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_port_vsi) },
};
static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
[IFLA_XDP_FD] = { .type = NLA_S32 },
[IFLA_XDP_ATTACHED] = { .type = NLA_U8 },
+ [IFLA_XDP_FLAGS] = { .type = NLA_U32 },
};
static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
@@ -2164,6 +2167,7 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_XDP]) {
struct nlattr *xdp[IFLA_XDP_MAX + 1];
+ u32 xdp_flags = 0;
err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP],
ifla_xdp_policy);
@@ -2174,9 +2178,19 @@ static int do_setlink(const struct sk_buff *skb,
err = -EINVAL;
goto errout;
}
+
+ if (xdp[IFLA_XDP_FLAGS]) {
+ xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]);
+ if (xdp_flags & ~XDP_FLAGS_MASK) {
+ err = -EINVAL;
+ goto errout;
+ }
+ }
+
if (xdp[IFLA_XDP_FD]) {
err = dev_change_xdp_fd(dev,
- nla_get_s32(xdp[IFLA_XDP_FD]));
+ nla_get_s32(xdp[IFLA_XDP_FD]),
+ xdp_flags);
if (err)
goto errout;
status |= DO_SETLINK_NOTIFY;
@@ -2344,7 +2358,6 @@ struct net_device *rtnl_create_link(struct net *net,
const char *ifname, unsigned char name_assign_type,
const struct rtnl_link_ops *ops, struct nlattr *tb[])
{
- int err;
struct net_device *dev;
unsigned int num_tx_queues = 1;
unsigned int num_rx_queues = 1;
@@ -2359,11 +2372,10 @@ struct net_device *rtnl_create_link(struct net *net,
else if (ops->get_num_rx_queues)
num_rx_queues = ops->get_num_rx_queues();
- err = -ENOMEM;
dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
ops->setup, num_tx_queues, num_rx_queues);
if (!dev)
- goto err;
+ return ERR_PTR(-ENOMEM);
dev_net_set(dev, net);
dev->rtnl_link_ops = ops;
@@ -2389,9 +2401,6 @@ struct net_device *rtnl_create_link(struct net *net,
dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
return dev;
-
-err:
- return ERR_PTR(err);
}
EXPORT_SYMBOL(rtnl_create_link);
@@ -2559,7 +2568,7 @@ replay:
return -ENODEV;
}
- if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO])
+ if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
return -EOPNOTSUPP;
if (!ops) {
@@ -2641,6 +2650,11 @@ replay:
if (err < 0)
goto out_unregister;
}
+ if (tb[IFLA_MASTER]) {
+ err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
+ if (err)
+ goto out_unregister;
+ }
out:
if (link_net)
put_net(link_net);
@@ -3165,7 +3179,7 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb,
err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc);
if (err)
goto out;
- nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc);
+ err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc);
out:
netif_addr_unlock_bh(dev);
return err;
@@ -3671,7 +3685,7 @@ static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
if (!size)
continue;
- if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+ if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
continue;
attr = nla_reserve_64bit(skb, attr_id, size,
@@ -3712,7 +3726,7 @@ static int rtnl_get_offload_stats_size(const struct net_device *dev)
for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
- if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+ if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
continue;
size = rtnl_get_offload_stats_attr_size(attr_id);
nla_size += nla_total_size_64bit(size);
@@ -3817,6 +3831,39 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
*idxattr = 0;
}
+ if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) {
+ struct rtnl_af_ops *af_ops;
+
+ *idxattr = IFLA_STATS_AF_SPEC;
+ attr = nla_nest_start(skb, IFLA_STATS_AF_SPEC);
+ if (!attr)
+ goto nla_put_failure;
+
+ list_for_each_entry(af_ops, &rtnl_af_ops, list) {
+ if (af_ops->fill_stats_af) {
+ struct nlattr *af;
+ int err;
+
+ af = nla_nest_start(skb, af_ops->family);
+ if (!af)
+ goto nla_put_failure;
+
+ err = af_ops->fill_stats_af(skb, dev);
+
+ if (err == -ENODATA)
+ nla_nest_cancel(skb, af);
+ else if (err < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, af);
+ }
+ }
+
+ nla_nest_end(skb, attr);
+
+ *idxattr = 0;
+ }
+
nlmsg_end(skb, nlh);
return 0;
@@ -3873,6 +3920,23 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
size += rtnl_get_offload_stats_size(dev);
+ if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) {
+ struct rtnl_af_ops *af_ops;
+
+ /* for IFLA_STATS_AF_SPEC */
+ size += nla_total_size(0);
+
+ list_for_each_entry(af_ops, &rtnl_af_ops, list) {
+ if (af_ops->get_stats_af_size) {
+ size += nla_total_size(
+ af_ops->get_stats_af_size(dev));
+
+ /* for AF_* */
+ size += nla_total_size(0);
+ }
+ }
+ }
+
return size;
}
@@ -3886,6 +3950,9 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh)
u32 filter_mask;
int err;
+ if (nlmsg_len(nlh) < sizeof(*ifsm))
+ return -EINVAL;
+
ifsm = nlmsg_data(nlh);
if (ifsm->ifindex > 0)
dev = __dev_get_by_index(net, ifsm->ifindex);
@@ -3935,6 +4002,9 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->seq = net->dev_base_seq;
+ if (nlmsg_len(cb->nlh) < sizeof(*ifsm))
+ return -EINVAL;
+
ifsm = nlmsg_data(cb->nlh);
filter_mask = ifsm->filter_mask;
if (!filter_mask)
diff --git a/net/core/scm.c b/net/core/scm.c
index 2696aefdc148..b1ff8a441748 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -14,6 +14,7 @@
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/sched/user.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/stat.h>
@@ -29,7 +30,7 @@
#include <linux/nsproxy.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -71,7 +72,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
struct file **fpp;
int i, num;
- num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int);
+ num = (cmsg->cmsg_len - sizeof(struct cmsghdr))/sizeof(int);
if (num <= 0)
return 0;
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index fd3ce461fbe6..d28da7d363f1 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -1,3 +1,7 @@
+/*
+ * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/cryptohash.h>
@@ -8,17 +12,20 @@
#include <linux/ktime.h>
#include <linux/string.h>
#include <linux/net.h>
-
+#include <linux/siphash.h>
#include <net/secure_seq.h>
#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
-#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)
+#include <linux/in6.h>
+#include <net/tcp.h>
-static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
+static siphash_key_t net_secret __read_mostly;
+static siphash_key_t ts_secret __read_mostly;
static __always_inline void net_secret_init(void)
{
- net_get_random_once(net_secret, sizeof(net_secret));
+ net_get_random_once(&ts_secret, sizeof(ts_secret));
+ net_get_random_once(&net_secret, sizeof(net_secret));
}
#endif
@@ -40,81 +47,98 @@ static u32 seq_scale(u32 seq)
#endif
#if IS_ENABLED(CONFIG_IPV6)
-__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
- __be16 sport, __be16 dport)
+static u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr)
{
- u32 secret[MD5_MESSAGE_BYTES / 4];
- u32 hash[MD5_DIGEST_WORDS];
- u32 i;
+ const struct {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .saddr = *(struct in6_addr *)saddr,
+ .daddr = *(struct in6_addr *)daddr,
+ };
+
+ if (sysctl_tcp_timestamps != 1)
+ return 0;
+
+ return siphash(&combined, offsetofend(typeof(combined), daddr),
+ &ts_secret);
+}
+u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
+ __be16 sport, __be16 dport, u32 *tsoff)
+{
+ const struct {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ __be16 sport;
+ __be16 dport;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .saddr = *(struct in6_addr *)saddr,
+ .daddr = *(struct in6_addr *)daddr,
+ .sport = sport,
+ .dport = dport
+ };
+ u64 hash;
net_secret_init();
- memcpy(hash, saddr, 16);
- for (i = 0; i < 4; i++)
- secret[i] = net_secret[i] + (__force u32)daddr[i];
- secret[4] = net_secret[4] +
- (((__force u16)sport << 16) + (__force u16)dport);
- for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
- secret[i] = net_secret[i];
-
- md5_transform(hash, secret);
-
- return seq_scale(hash[0]);
+ hash = siphash(&combined, offsetofend(typeof(combined), dport),
+ &net_secret);
+ *tsoff = secure_tcpv6_ts_off(saddr, daddr);
+ return seq_scale(hash);
}
EXPORT_SYMBOL(secure_tcpv6_sequence_number);
u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
__be16 dport)
{
- u32 secret[MD5_MESSAGE_BYTES / 4];
- u32 hash[MD5_DIGEST_WORDS];
- u32 i;
-
+ const struct {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ __be16 dport;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .saddr = *(struct in6_addr *)saddr,
+ .daddr = *(struct in6_addr *)daddr,
+ .dport = dport
+ };
net_secret_init();
- memcpy(hash, saddr, 16);
- for (i = 0; i < 4; i++)
- secret[i] = net_secret[i] + (__force u32) daddr[i];
- secret[4] = net_secret[4] + (__force u32)dport;
- for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
- secret[i] = net_secret[i];
-
- md5_transform(hash, secret);
-
- return hash[0];
+ return siphash(&combined, offsetofend(typeof(combined), dport),
+ &net_secret);
}
EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#endif
#ifdef CONFIG_INET
-
-__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport)
+static u32 secure_tcp_ts_off(__be32 saddr, __be32 daddr)
{
- u32 hash[MD5_DIGEST_WORDS];
+ if (sysctl_tcp_timestamps != 1)
+ return 0;
- net_secret_init();
- hash[0] = (__force u32)saddr;
- hash[1] = (__force u32)daddr;
- hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
- hash[3] = net_secret[15];
+ return siphash_2u32((__force u32)saddr, (__force u32)daddr,
+ &ts_secret);
+}
- md5_transform(hash, net_secret);
+/* secure_tcp_sequence_number(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d),
+ * but fortunately, `sport' cannot be 0 in any circumstances. If this changes,
+ * it would be easy enough to have the former function use siphash_4u32, passing
+ * the arguments as separate u32.
+ */
- return seq_scale(hash[0]);
+u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport, u32 *tsoff)
+{
+ u64 hash;
+ net_secret_init();
+ hash = siphash_3u32((__force u32)saddr, (__force u32)daddr,
+ (__force u32)sport << 16 | (__force u32)dport,
+ &net_secret);
+ *tsoff = secure_tcp_ts_off(saddr, daddr);
+ return seq_scale(hash);
}
u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
{
- u32 hash[MD5_DIGEST_WORDS];
-
net_secret_init();
- hash[0] = (__force u32)saddr;
- hash[1] = (__force u32)daddr;
- hash[2] = (__force u32)dport ^ net_secret[14];
- hash[3] = net_secret[15];
-
- md5_transform(hash, net_secret);
-
- return hash[0];
+ return siphash_3u32((__force u32)saddr, (__force u32)daddr,
+ (__force u16)dport, &net_secret);
}
EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
#endif
@@ -123,21 +147,13 @@ EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
__be16 sport, __be16 dport)
{
- u32 hash[MD5_DIGEST_WORDS];
u64 seq;
-
net_secret_init();
- hash[0] = (__force u32)saddr;
- hash[1] = (__force u32)daddr;
- hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
- hash[3] = net_secret[15];
-
- md5_transform(hash, net_secret);
-
- seq = hash[0] | (((u64)hash[1]) << 32);
+ seq = siphash_3u32((__force u32)saddr, (__force u32)daddr,
+ (__force u32)sport << 16 | (__force u32)dport,
+ &net_secret);
seq += ktime_get_real_ns();
seq &= (1ull << 48) - 1;
-
return seq;
}
EXPORT_SYMBOL(secure_dccp_sequence_number);
@@ -146,26 +162,23 @@ EXPORT_SYMBOL(secure_dccp_sequence_number);
u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
__be16 sport, __be16 dport)
{
- u32 secret[MD5_MESSAGE_BYTES / 4];
- u32 hash[MD5_DIGEST_WORDS];
+ const struct {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ __be16 sport;
+ __be16 dport;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .saddr = *(struct in6_addr *)saddr,
+ .daddr = *(struct in6_addr *)daddr,
+ .sport = sport,
+ .dport = dport
+ };
u64 seq;
- u32 i;
-
net_secret_init();
- memcpy(hash, saddr, 16);
- for (i = 0; i < 4; i++)
- secret[i] = net_secret[i] + (__force u32)daddr[i];
- secret[4] = net_secret[4] +
- (((__force u16)sport << 16) + (__force u16)dport);
- for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
- secret[i] = net_secret[i];
-
- md5_transform(hash, secret);
-
- seq = hash[0] | (((u64)hash[1]) << 32);
+ seq = siphash(&combined, offsetofend(typeof(combined), dport),
+ &net_secret);
seq += ktime_get_real_ns();
seq &= (1ull << 48) - 1;
-
return seq;
}
EXPORT_SYMBOL(secure_dccpv6_sequence_number);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1e3e0087245b..f86bf69cfb8d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -72,7 +72,7 @@
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
@@ -271,7 +271,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
atomic_set(&fclones->fclone_ref, 1);
fclones->skb2.fclone = SKB_FCLONE_CLONE;
- fclones->skb2.pfmemalloc = pfmemalloc;
}
out:
return skb;
@@ -354,7 +353,7 @@ EXPORT_SYMBOL(build_skb);
struct napi_alloc_cache {
struct page_frag_cache page;
- size_t skb_count;
+ unsigned int skb_count;
void *skb_cache[NAPI_SKB_CACHE_SIZE];
};
@@ -369,7 +368,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
local_irq_save(flags);
nc = this_cpu_ptr(&netdev_alloc_cache);
- data = __alloc_page_frag(nc, fragsz, gfp_mask);
+ data = page_frag_alloc(nc, fragsz, gfp_mask);
local_irq_restore(flags);
return data;
}
@@ -391,7 +390,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
+ return page_frag_alloc(&nc->page, fragsz, gfp_mask);
}
void *napi_alloc_frag(unsigned int fragsz)
@@ -441,7 +440,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
local_irq_save(flags);
nc = this_cpu_ptr(&netdev_alloc_cache);
- data = __alloc_page_frag(nc, len, gfp_mask);
+ data = page_frag_alloc(nc, len, gfp_mask);
pfmemalloc = nc->pfmemalloc;
local_irq_restore(flags);
@@ -505,7 +504,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- data = __alloc_page_frag(&nc->page, len, gfp_mask);
+ data = page_frag_alloc(&nc->page, len, gfp_mask);
if (unlikely(!data))
return NULL;
@@ -655,7 +654,7 @@ static void skb_release_head_state(struct sk_buff *skb)
skb->destructor(skb);
}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- nf_conntrack_put(skb->nfct);
+ nf_conntrack_put(skb_nfct(skb));
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
nf_bridge_put(skb->nf_bridge);
@@ -878,9 +877,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#endif
#ifdef CONFIG_NET_SCHED
CHECK_SKB_FIELD(tc_index);
-#ifdef CONFIG_NET_CLS_ACT
- CHECK_SKB_FIELD(tc_verd);
-#endif
#endif
}
@@ -1195,10 +1191,10 @@ EXPORT_SYMBOL(__pskb_copy_fclone);
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
gfp_t gfp_mask)
{
- int i;
- u8 *data;
- int size = nhead + skb_end_offset(skb) + ntail;
+ int i, osize = skb_end_offset(skb);
+ int size = osize + nhead + ntail;
long off;
+ u8 *data;
BUG_ON(nhead < 0);
@@ -1260,6 +1256,14 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
skb->hdr_len = 0;
skb->nohdr = 0;
atomic_set(&skb_shinfo(skb)->dataref, 1);
+
+ /* It is not generally safe to change skb->truesize.
+ * For the moment, we really care of rx path, or
+ * when skb is orphaned (not attached to a socket).
+ */
+ if (!skb->sk || skb->destructor == sock_edemux)
+ skb->truesize += size - osize;
+
return 0;
nofrags:
@@ -2656,7 +2660,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
struct skb_frag_struct *fragfrom, *fragto;
BUG_ON(shiftlen > skb->len);
- BUG_ON(skb_headlen(skb)); /* Would corrupt stream */
+
+ if (skb_headlen(skb))
+ return 0;
todo = shiftlen;
from = 0;
@@ -3076,22 +3082,32 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
if (sg && csum && (mss != GSO_BY_FRAGS)) {
if (!(features & NETIF_F_GSO_PARTIAL)) {
struct sk_buff *iter;
+ unsigned int frag_len;
if (!list_skb ||
!net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
goto normal;
- /* Split the buffer at the frag_list pointer.
- * This is based on the assumption that all
- * buffers in the chain excluding the last
- * containing the same amount of data.
+ /* If we get here then all the required
+ * GSO features except frag_list are supported.
+ * Try to split the SKB to multiple GSO SKBs
+ * with no frag_list.
+ * Currently we can do that only when the buffers don't
+ * have a linear part and all the buffers except
+ * the last are of the same length.
*/
+ frag_len = list_skb->len;
skb_walk_frags(head_skb, iter) {
+ if (frag_len != iter->len && iter->next)
+ goto normal;
if (skb_headlen(iter))
goto normal;
len -= iter->len;
}
+
+ if (len != frag_len)
+ goto normal;
}
/* GSO partial only requires that we trim off any excess that
@@ -3688,6 +3704,15 @@ static void sock_rmem_free(struct sk_buff *skb)
atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
}
+static void skb_set_err_queue(struct sk_buff *skb)
+{
+ /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING.
+ * So, it is safe to (mis)use it to mark skbs on the error queue.
+ */
+ skb->pkt_type = PACKET_OUTGOING;
+ BUILD_BUG_ON(PACKET_OUTGOING == 0);
+}
+
/*
* Note: We dont mem charge error packets (no sk_forward_alloc changes)
*/
@@ -3701,6 +3726,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
skb->sk = sk;
skb->destructor = sock_rmem_free;
atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+ skb_set_err_queue(skb);
/* before exiting rcu section, make sure dst is refcounted */
skb_dst_force(skb);
@@ -3712,21 +3738,29 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_queue_err_skb);
+static bool is_icmp_err_skb(const struct sk_buff *skb)
+{
+ return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+ SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
+}
+
struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
{
struct sk_buff_head *q = &sk->sk_error_queue;
- struct sk_buff *skb, *skb_next;
+ struct sk_buff *skb, *skb_next = NULL;
+ bool icmp_next = false;
unsigned long flags;
- int err = 0;
spin_lock_irqsave(&q->lock, flags);
skb = __skb_dequeue(q);
if (skb && (skb_next = skb_peek(q)))
- err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
+ icmp_next = is_icmp_err_skb(skb_next);
spin_unlock_irqrestore(&q->lock, flags);
- sk->sk_err = err;
- if (err)
+ if (is_icmp_err_skb(skb) && !icmp_next)
+ sk->sk_err = 0;
+
+ if (skb_next)
sk->sk_error_report(sk);
return skb;
@@ -3769,16 +3803,21 @@ EXPORT_SYMBOL(skb_clone_sk);
static void __skb_complete_tx_timestamp(struct sk_buff *skb,
struct sock *sk,
- int tstype)
+ int tstype,
+ bool opt_stats)
{
struct sock_exterr_skb *serr;
int err;
+ BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));
+
serr = SKB_EXT_ERR(skb);
memset(serr, 0, sizeof(*serr));
serr->ee.ee_errno = ENOMSG;
serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
serr->ee.ee_info = tstype;
+ serr->opt_stats = opt_stats;
+ serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
serr->ee.ee_data = skb_shinfo(skb)->tskey;
if (sk->sk_protocol == IPPROTO_TCP &&
@@ -3814,13 +3853,14 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
if (!skb_may_tx_timestamp(sk, false))
return;
- /* take a reference to prevent skb_orphan() from freeing the socket */
- sock_hold(sk);
-
- *skb_hwtstamps(skb) = *hwtstamps;
- __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND);
-
- sock_put(sk);
+ /* Take a reference to prevent skb_orphan() from freeing the socket,
+ * but only if the socket refcount is not zero.
+ */
+ if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
+ *skb_hwtstamps(skb) = *hwtstamps;
+ __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
+ sock_put(sk);
+ }
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
@@ -3829,7 +3869,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
struct sock *sk, int tstype)
{
struct sk_buff *skb;
- bool tsonly;
+ bool tsonly, opt_stats = false;
if (!sk)
return;
@@ -3838,10 +3878,19 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (!skb_may_tx_timestamp(sk, tsonly))
return;
- if (tsonly)
- skb = alloc_skb(0, GFP_ATOMIC);
- else
+ if (tsonly) {
+#ifdef CONFIG_INET
+ if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
+ sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM) {
+ skb = tcp_get_timestamping_opt_stats(sk);
+ opt_stats = true;
+ } else
+#endif
+ skb = alloc_skb(0, GFP_ATOMIC);
+ } else {
skb = skb_clone(orig_skb, GFP_ATOMIC);
+ }
if (!skb)
return;
@@ -3855,7 +3904,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
else
skb->tstamp = ktime_get_real();
- __skb_complete_tx_timestamp(skb, sk, tstype);
+ __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
@@ -3871,7 +3920,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
{
struct sock *sk = skb->sk;
struct sock_exterr_skb *serr;
- int err;
+ int err = 1;
skb->wifi_acked_valid = 1;
skb->wifi_acked = acked;
@@ -3881,14 +3930,15 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
serr->ee.ee_errno = ENOMSG;
serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
- /* take a reference to prevent skb_orphan() from freeing the socket */
- sock_hold(sk);
-
- err = sock_queue_err_skb(sk, skb);
+ /* Take a reference to prevent skb_orphan() from freeing the socket,
+ * but only if the socket refcount is not zero.
+ */
+ if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
+ err = sock_queue_err_skb(sk, skb);
+ sock_put(sk);
+ }
if (err)
kfree_skb(skb);
-
- sock_put(sk);
}
EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
@@ -4350,7 +4400,7 @@ EXPORT_SYMBOL(skb_try_coalesce);
*/
void skb_scrub_packet(struct sk_buff *skb, bool xnet)
{
- skb->tstamp.tv64 = 0;
+ skb->tstamp = 0;
skb->pkt_type = PACKET_HOST;
skb->skb_iif = 0;
skb->ignore_df = 0;
@@ -4913,3 +4963,35 @@ struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
return clone;
}
EXPORT_SYMBOL(pskb_extract);
+
+/**
+ * skb_condense - try to get rid of fragments/frag_list if possible
+ * @skb: buffer
+ *
+ * Can be used to save memory before skb is added to a busy queue.
+ * If packet has bytes in frags and enough tail room in skb->head,
+ * pull all of them, so that we can free the frags right now and adjust
+ * truesize.
+ * Notes:
+ * We do not reallocate skb->head thus can not fail.
+ * Caller must re-evaluate skb->truesize if needed.
+ */
+void skb_condense(struct sk_buff *skb)
+{
+ if (skb->data_len) {
+ if (skb->data_len > skb->end - skb->tail ||
+ skb_cloned(skb))
+ return;
+
+ /* Nice, we can free page frag(s) right now */
+ __pskb_pull_tail(skb, skb->data_len);
+ }
+ /* At this point, skb->truesize might be over estimated,
+ * because skb had a fragment, and fragments do not tell
+ * their truesize.
+ * When we pulled its content into skb->head, fragment
+ * was freed, but __pskb_pull_tail() could not possibly
+ * adjust skb->truesize, not knowing the frag truesize.
+ */
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+}
diff --git a/net/core/sock.c b/net/core/sock.c
index 00a074dbfe9b..2c4f574168fb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -118,7 +118,7 @@
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/netdevice.h>
#include <net/protocol.h>
@@ -197,66 +197,55 @@ EXPORT_SYMBOL(sk_net_capable);
/*
* Each address family might have different locking rules, so we have
- * one slock key per address family:
+ * one slock key per address family and separate keys for internal and
+ * userspace sockets.
*/
static struct lock_class_key af_family_keys[AF_MAX];
+static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
+static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
/*
* Make lock validator output more readable. (we pre-construct these
* strings build-time, so that runtime initialization of socket
* locks is fast):
*/
+
+#define _sock_locks(x) \
+ x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
+ x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
+ x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
+ x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
+ x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
+ x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
+ x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
+ x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
+ x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
+ x "27" , x "28" , x "AF_CAN" , \
+ x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
+ x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
+ x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
+ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
+ x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX"
+
static const char *const af_family_key_strings[AF_MAX+1] = {
- "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
- "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
- "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
- "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
- "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
- "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
- "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
- "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
- "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
- "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
- "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
- "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
- "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
- "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" ,
- "sk_lock-AF_MAX"
+ _sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
- "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
- "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
- "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
- "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
- "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
- "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
- "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
- "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
- "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
- "slock-27" , "slock-28" , "slock-AF_CAN" ,
- "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
- "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
- "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
- "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" ,
- "slock-AF_MAX"
+ _sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
- "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
- "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
- "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
- "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
- "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
- "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
- "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
- "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
- "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
- "clock-27" , "clock-28" , "clock-AF_CAN" ,
- "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
- "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
- "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
- "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" ,
- "clock-AF_MAX"
+ _sock_locks("clock-")
+};
+
+static const char *const af_family_kern_key_strings[AF_MAX+1] = {
+ _sock_locks("k-sk_lock-")
+};
+static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
+ _sock_locks("k-slock-")
+};
+static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
+ _sock_locks("k-clock-")
};
/*
@@ -264,6 +253,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
* so split the lock classes by using a per-AF key:
*/
static struct lock_class_key af_callback_keys[AF_MAX];
+static struct lock_class_key af_kern_callback_keys[AF_MAX];
/* Take into consideration the size of the struct sk_buff overhead in the
* determination of these values, since that is non-constant across
@@ -367,7 +357,7 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
if (tv.tv_sec == 0 && tv.tv_usec == 0)
return 0;
if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
- *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
+ *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
return 0;
}
@@ -502,6 +492,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
sk_tx_queue_clear(sk);
+ sk->sk_dst_pending_confirm = 0;
RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
dst_release(dst);
return NULL;
@@ -762,11 +753,8 @@ set_rcvbuf:
goto set_rcvbuf;
case SO_KEEPALIVE:
-#ifdef CONFIG_INET
- if (sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM)
- tcp_set_keepalive(sk, valbool);
-#endif
+ if (sk->sk_prot->keepalive)
+ sk->sk_prot->keepalive(sk, valbool);
sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
break;
@@ -854,6 +842,13 @@ set_rcvbuf:
sk->sk_tskey = 0;
}
}
+
+ if (val & SOF_TIMESTAMPING_OPT_STATS &&
+ !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
+ ret = -EINVAL;
+ break;
+ }
+
sk->sk_tsflags = val;
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
sock_enable_timestamp(sk,
@@ -1141,7 +1136,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.tm.tv_usec = 0;
} else {
v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
- v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
+ v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
}
break;
@@ -1152,7 +1147,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.tm.tv_usec = 0;
} else {
v.tm.tv_sec = sk->sk_sndtimeo / HZ;
- v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
+ v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
}
break;
@@ -1288,7 +1283,16 @@ lenout:
*/
static inline void sock_lock_init(struct sock *sk)
{
- sock_lock_init_class_and_name(sk,
+ if (sk->sk_kern_sock)
+ sock_lock_init_class_and_name(
+ sk,
+ af_family_kern_slock_key_strings[sk->sk_family],
+ af_family_kern_slock_keys + sk->sk_family,
+ af_family_kern_key_strings[sk->sk_family],
+ af_family_kern_keys + sk->sk_family);
+ else
+ sock_lock_init_class_and_name(
+ sk,
af_family_slock_key_strings[sk->sk_family],
af_family_slock_keys + sk->sk_family,
af_family_key_strings[sk->sk_family],
@@ -1394,6 +1398,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
* why we need sk_prot_creator -acme
*/
sk->sk_prot = sk->sk_prot_creator = prot;
+ sk->sk_kern_sock = kern;
sock_lock_init(sk);
sk->sk_net_refcnt = kern ? 0 : 1;
if (likely(sk->sk_net_refcnt))
@@ -1437,6 +1442,11 @@ static void __sk_destruct(struct rcu_head *head)
pr_debug("%s: optmem leakage (%d bytes) detected\n",
__func__, atomic_read(&sk->sk_omem_alloc));
+ if (sk->sk_frag.page) {
+ put_page(sk->sk_frag.page);
+ sk->sk_frag.page = NULL;
+ }
+
if (sk->sk_peer_cred)
put_cred(sk->sk_peer_cred);
put_pid(sk->sk_peer_pid);
@@ -1515,6 +1525,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
af_family_clock_key_strings[newsk->sk_family]);
newsk->sk_dst_cache = NULL;
+ newsk->sk_dst_pending_confirm = 0;
newsk->sk_wmem_queued = 0;
newsk->sk_forward_alloc = 0;
atomic_set(&newsk->sk_drops, 0);
@@ -1533,11 +1544,13 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
is_charged = sk_filter_charge(newsk, filter);
if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
- /* It is still raw copy of parent, so invalidate
- * destructor and make plain sk_free() */
- newsk->sk_destruct = NULL;
- bh_unlock_sock(newsk);
- sk_free(newsk);
+ /* We need to make sure that we don't uncharge the new
+ * socket if we couldn't charge it in the first place
+ * as otherwise we uncharge the parent's filter.
+ */
+ if (!is_charged)
+ RCU_INIT_POINTER(newsk->sk_filter, NULL);
+ sk_free_unlock_clone(newsk);
newsk = NULL;
goto out;
}
@@ -1586,6 +1599,16 @@ out:
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
+void sk_free_unlock_clone(struct sock *sk)
+{
+ /* It is still raw copy of parent, so invalidate
+ * destructor and make plain sk_free() */
+ sk->sk_destruct = NULL;
+ bh_unlock_sock(sk);
+ sk_free(sk);
+}
+EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
+
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
u32 max_segs = 1;
@@ -2080,37 +2103,31 @@ void __sk_flush_backlog(struct sock *sk)
*/
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
int rc;
- DEFINE_WAIT(wait);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
+ rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
return rc;
}
EXPORT_SYMBOL(sk_wait_data);
/**
- * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ * __sk_mem_raise_allocated - increase memory_allocated
* @sk: socket
* @size: memory size to allocate
+ * @amt: pages to allocate
* @kind: allocation type
*
- * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
- * rmem allocation. This function assumes that protocols which have
- * memory_pressure use sk_wmem_queued as write buffer accounting.
+ * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
*/
-int __sk_mem_schedule(struct sock *sk, int size, int kind)
+int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
struct proto *prot = sk->sk_prot;
- int amt = sk_mem_pages(size);
- long allocated;
-
- sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
-
- allocated = sk_memory_allocated_add(sk, amt);
+ long allocated = sk_memory_allocated_add(sk, amt);
if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
!mem_cgroup_charge_skmem(sk->sk_memcg, amt))
@@ -2171,9 +2188,6 @@ suppress_allocation:
trace_sock_exceed_buf_limit(sk, prot, allocated);
- /* Alas. Undo changes. */
- sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
-
sk_memory_allocated_sub(sk, amt);
if (mem_cgroup_sockets_enabled && sk->sk_memcg)
@@ -2181,18 +2195,40 @@ suppress_allocation:
return 0;
}
+EXPORT_SYMBOL(__sk_mem_raise_allocated);
+
+/**
+ * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ * @sk: socket
+ * @size: memory size to allocate
+ * @kind: allocation type
+ *
+ * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
+ * rmem allocation. This function assumes that protocols which have
+ * memory_pressure use sk_wmem_queued as write buffer accounting.
+ */
+int __sk_mem_schedule(struct sock *sk, int size, int kind)
+{
+ int ret, amt = sk_mem_pages(size);
+
+ sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
+ ret = __sk_mem_raise_allocated(sk, size, amt, kind);
+ if (!ret)
+ sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
+ return ret;
+}
EXPORT_SYMBOL(__sk_mem_schedule);
/**
- * __sk_mem_reclaim - reclaim memory_allocated
+ * __sk_mem_reduce_allocated - reclaim memory_allocated
* @sk: socket
- * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ * @amount: number of quanta
+ *
+ * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
*/
-void __sk_mem_reclaim(struct sock *sk, int amount)
+void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
- amount >>= SK_MEM_QUANTUM_SHIFT;
sk_memory_allocated_sub(sk, amount);
- sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
if (mem_cgroup_sockets_enabled && sk->sk_memcg)
mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
@@ -2201,6 +2237,19 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
sk_leave_memory_pressure(sk);
}
+EXPORT_SYMBOL(__sk_mem_reduce_allocated);
+
+/**
+ * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
+ * @sk: socket
+ * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ */
+void __sk_mem_reclaim(struct sock *sk, int amount)
+{
+ amount >>= SK_MEM_QUANTUM_SHIFT;
+ sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+ __sk_mem_reduce_allocated(sk, amount);
+}
EXPORT_SYMBOL(__sk_mem_reclaim);
int sk_set_peek_off(struct sock *sk, int val)
@@ -2239,7 +2288,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
}
EXPORT_SYMBOL(sock_no_socketpair);
-int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
+int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern)
{
return -EOPNOTSUPP;
}
@@ -2436,11 +2486,21 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_type = sock->type;
sk->sk_wq = sock->wq;
sock->sk = sk;
- } else
+ sk->sk_uid = SOCK_INODE(sock)->i_uid;
+ } else {
sk->sk_wq = NULL;
+ sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
+ }
rwlock_init(&sk->sk_callback_lock);
- lockdep_set_class_and_name(&sk->sk_callback_lock,
+ if (sk->sk_kern_sock)
+ lockdep_set_class_and_name(
+ &sk->sk_callback_lock,
+ af_kern_callback_keys + sk->sk_family,
+ af_family_kern_clock_key_strings[sk->sk_family]);
+ else
+ lockdep_set_class_and_name(
+ &sk->sk_callback_lock,
af_callback_keys + sk->sk_family,
af_family_clock_key_strings[sk->sk_family]);
@@ -2738,11 +2798,6 @@ void sk_common_release(struct sock *sk)
sk_refcnt_debug_release(sk);
- if (sk->sk_frag.page) {
- put_page(sk->sk_frag.page);
- sk->sk_frag.page = NULL;
- }
-
sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
diff --git a/net/core/stream.c b/net/core/stream.c
index 1086c8b280a8..20231dbb1da0 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -13,6 +13,7 @@
*/
#include <linux/module.h>
+#include <linux/sched/signal.h>
#include <linux/net.h>
#include <linux/signal.h>
#include <linux/tcp.h>
@@ -53,8 +54,8 @@ void sk_stream_write_space(struct sock *sk)
*/
int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct task_struct *tsk = current;
- DEFINE_WAIT(wait);
int done;
do {
@@ -68,13 +69,13 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
if (signal_pending(tsk))
return sock_intr_errno(*timeo_p);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending++;
done = sk_wait_event(sk, timeo_p,
!sk->sk_err &&
!((1 << sk->sk_state) &
- ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
- finish_wait(sk_sleep(sk), &wait);
+ ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending--;
} while (!done);
return 0;
@@ -94,16 +95,16 @@ static inline int sk_stream_closing(struct sock *sk)
void sk_stream_wait_close(struct sock *sk, long timeout)
{
if (timeout) {
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ add_wait_queue(sk_sleep(sk), &wait);
do {
- prepare_to_wait(sk_sleep(sk), &wait,
- TASK_INTERRUPTIBLE);
- if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk)))
+ if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk), &wait))
break;
} while (!signal_pending(current) && timeout);
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
}
}
EXPORT_SYMBOL(sk_stream_wait_close);
@@ -119,16 +120,16 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
long vm_wait = 0;
long current_timeo = *timeo_p;
bool noblock = (*timeo_p ? false : true);
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
if (sk_stream_memory_free(sk))
current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
+ add_wait_queue(sk_sleep(sk), &wait);
+
while (1) {
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
-
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto do_error;
if (!*timeo_p) {
@@ -147,7 +148,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
sk_wait_event(sk, &current_timeo, sk->sk_err ||
(sk->sk_shutdown & SEND_SHUTDOWN) ||
(sk_stream_memory_free(sk) &&
- !vm_wait));
+ !vm_wait), &wait);
sk->sk_write_pending--;
if (vm_wait) {
@@ -161,7 +162,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
*timeo_p = current_timeo;
}
out:
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
return err;
do_error:
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 0df2aa652530..7f9cc400eca0 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -79,10 +79,13 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
if (sock_table != orig_sock_table) {
rcu_assign_pointer(rps_sock_flow_table, sock_table);
- if (sock_table)
+ if (sock_table) {
static_key_slow_inc(&rps_needed);
+ static_key_slow_inc(&rfs_needed);
+ }
if (orig_sock_table) {
static_key_slow_dec(&rps_needed);
+ static_key_slow_dec(&rfs_needed);
synchronize_rcu();
vfree(orig_sock_table);
}
@@ -219,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, int write,
}
#endif
+static int proc_do_dev_weight(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ret != 0)
+ return ret;
+
+ dev_rx_weight = weight_p * dev_weight_rx_bias;
+ dev_tx_weight = weight_p * dev_weight_tx_bias;
+
+ return ret;
+}
+
static int proc_do_rss_key(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -270,7 +288,21 @@ static struct ctl_table net_core_table[] = {
.data = &weight_p,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_do_dev_weight,
+ },
+ {
+ .procname = "dev_weight_rx_bias",
+ .data = &dev_weight_rx_bias,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_do_dev_weight,
+ },
+ {
+ .procname = "dev_weight_tx_bias",
+ .data = &dev_weight_tx_bias,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_do_dev_weight,
},
{
.procname = "netdev_max_backlog",
@@ -302,6 +334,13 @@ static struct ctl_table net_core_table[] = {
.mode = 0600,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "bpf_jit_kallsyms",
+ .data = &bpf_jit_kallsyms,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec,
+ },
# endif
#endif
{
@@ -369,14 +408,16 @@ static struct ctl_table net_core_table[] = {
.data = &sysctl_net_busy_poll,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
},
{
.procname = "busy_read",
.data = &sysctl_net_busy_read,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
},
#endif
#ifdef CONFIG_NET_SCHED
diff --git a/net/core/utils.c b/net/core/utils.c
index cf5622b9ccc4..6592d7bbed39 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -31,7 +31,7 @@
#include <net/net_ratelimit.h>
#include <asm/byteorder.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10);
/*
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index f053198e730c..5e3a7302f774 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -749,6 +749,7 @@ static void ccid2_hc_tx_exit(struct sock *sk)
for (i = 0; i < hc->tx_seqbufc; i++)
kfree(hc->tx_seqbuf[i]);
hc->tx_seqbufc = 0;
+ dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
}
static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
diff --git a/net/dccp/input.c b/net/dccp/input.c
index ba347184bda9..4a05d7876850 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -577,6 +577,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct dccp_sock *dp = dccp_sk(sk);
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
const int old_state = sk->sk_state;
+ bool acceptable;
int queued = 0;
/*
@@ -603,10 +604,16 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
*/
if (sk->sk_state == DCCP_LISTEN) {
if (dh->dccph_type == DCCP_PKT_REQUEST) {
- if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
- skb) < 0)
+ /* It is possible that we process SYN packets from backlog,
+ * so we need to make sure to disable BH right there.
+ */
+ local_bh_disable();
+ acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0;
+ local_bh_enable();
+ if (!acceptable)
return 1;
- goto discard;
+ consume_skb(skb);
+ return 0;
}
if (dh->dccph_type == DCCP_PKT_RESET)
goto discard;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index edbe59d203ef..b99168b0fabf 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -289,7 +289,8 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
switch (type) {
case ICMP_REDIRECT:
- dccp_do_redirect(skb, sk);
+ if (!sock_owned_by_user(sk))
+ dccp_do_redirect(skb, sk);
goto out;
case ICMP_SOURCE_QUENCH:
/* Just silently ignore these. */
@@ -590,13 +591,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (inet_csk_reqsk_queue_is_full(sk))
goto drop;
- /*
- * Accept backlog is full. If we have already queued enough
- * of warm entries in syn queue, drop request. It is better than
- * clogging syn queue with openreqs with exponentially increasing
- * timeout.
- */
- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+ if (sk_acceptq_is_full(sk))
goto drop;
req = inet_reqsk_alloc(&dccp_request_sock_ops, sk, true);
@@ -910,7 +905,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
.getsockopt = ip_getsockopt,
.addr2sockaddr = inet_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in),
- .bind_conflict = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ip_setsockopt,
.compat_getsockopt = compat_ip_getsockopt,
@@ -1024,9 +1018,15 @@ static void __net_exit dccp_v4_exit_net(struct net *net)
inet_ctl_sock_destroy(net->dccp.v4_ctl_sk);
}
+static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list)
+{
+ inet_twsk_purge(&dccp_hashinfo, AF_INET);
+}
+
static struct pernet_operations dccp_v4_ops = {
.init = dccp_v4_init_net,
.exit = dccp_v4_exit_net,
+ .exit_batch = dccp_v4_exit_batch,
};
static int __init dccp_v4_init(void)
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 715e5d1dc107..d9b6a4e403e7 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -122,10 +122,12 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
np = inet6_sk(sk);
if (type == NDISC_REDIRECT) {
- struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
+ if (!sock_owned_by_user(sk)) {
+ struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
- if (dst)
- dst->ops->redirect(dst, sk, skb);
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
+ }
goto out;
}
@@ -227,7 +229,7 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req
opt = ireq->ipv6_opt;
if (!opt)
opt = rcu_dereference(np->opt);
- err = ip6_xmit(sk, skb, &fl6, opt, np->tclass);
+ err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -281,7 +283,7 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
if (!IS_ERR(dst)) {
skb_dst_set(skb, dst);
- ip6_xmit(ctl_sk, skb, &fl6, NULL, 0);
+ ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0);
DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
return;
@@ -326,7 +328,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (inet_csk_reqsk_queue_is_full(sk))
goto drop;
- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+ if (sk_acceptq_is_full(sk))
goto drop;
req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk, true);
@@ -937,7 +939,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
.getsockopt = ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in6),
- .bind_conflict = inet6_csk_bind_conflict,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,
@@ -958,7 +959,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
.getsockopt = ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in6),
- .bind_conflict = inet6_csk_bind_conflict,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,
@@ -1077,9 +1077,15 @@ static void __net_exit dccp_v6_exit_net(struct net *net)
inet_ctl_sock_destroy(net->dccp.v6_ctl_sk);
}
+static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list)
+{
+ inet_twsk_purge(&dccp_hashinfo, AF_INET6);
+}
+
static struct pernet_operations dccp_v6_ops = {
.init = dccp_v6_init_net,
.exit = dccp_v6_exit_net,
+ .exit_batch = dccp_v6_exit_batch,
};
static int __init dccp_v6_init(void)
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 53eddf99e4f6..abd07a443219 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -119,10 +119,7 @@ struct sock *dccp_create_openreq_child(const struct sock *sk,
* Activate features: initialise CCIDs, sequence windows etc.
*/
if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
- /* It is still raw copy of parent, so invalidate
- * destructor and make plain sk_free() */
- newsk->sk_destruct = NULL;
- sk_free(newsk);
+ sk_free_unlock_clone(newsk);
return NULL;
}
dccp_init_xmit_timers(newsk);
@@ -145,6 +142,13 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
struct dccp_request_sock *dreq = dccp_rsk(req);
bool own_req;
+ /* TCP/DCCP listeners became lockless.
+ * DCCP stores complex state in its request_sock, so we need
+ * a protection for them, now this code runs without being protected
+ * by the parent (listener) lock.
+ */
+ spin_lock_bh(&dreq->dreq_lock);
+
/* Check for retransmitted REQUEST */
if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
@@ -159,7 +163,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
inet_rtx_syn_ack(sk, req);
}
/* Network Duplicate, discard packet */
- return NULL;
+ goto out;
}
DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
@@ -185,20 +189,20 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
req, &own_req);
- if (!child)
- goto listen_overflow;
-
- return inet_csk_complete_hashdance(sk, child, req, own_req);
+ if (child) {
+ child = inet_csk_complete_hashdance(sk, child, req, own_req);
+ goto out;
+ }
-listen_overflow:
- dccp_pr_debug("listen_overflow!\n");
DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
drop:
if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
req->rsk_ops->send_reset(sk, skb);
inet_csk_reqsk_queue_drop(sk, req);
- return NULL;
+out:
+ spin_unlock_bh(&dreq->dreq_lock);
+ return child;
}
EXPORT_SYMBOL_GPL(dccp_check_req);
@@ -249,6 +253,7 @@ int dccp_reqsk_init(struct request_sock *req,
{
struct dccp_request_sock *dreq = dccp_rsk(req);
+ spin_lock_init(&dreq->dreq_lock);
inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport;
inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport);
inet_rsk(req)->acked = 0;
diff --git a/net/dccp/output.c b/net/dccp/output.c
index b66c84db0766..91a15b3c4915 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -14,6 +14,7 @@
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
+#include <linux/sched/signal.h>
#include <net/inet_sock.h>
#include <net/sock.h>
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 13d6b1a6e0fc..7de5b40a5d0d 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -106,7 +106,7 @@ Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
@@ -1070,7 +1070,8 @@ static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo)
return skb == NULL ? ERR_PTR(err) : skb;
}
-static int dn_accept(struct socket *sock, struct socket *newsock, int flags)
+static int dn_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern)
{
struct sock *sk = sock->sk, *newsk;
struct sk_buff *skb = NULL;
@@ -1099,7 +1100,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags)
cb = DN_SKB_CB(skb);
sk->sk_ack_backlog--;
- newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, 0);
+ newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, kern);
if (newsk == NULL) {
release_sock(sk);
kfree_skb(skb);
@@ -1718,7 +1719,7 @@ static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
* See if there is data ready to read, sleep if there isn't
*/
for(;;) {
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
if (sk->sk_err)
goto out;
@@ -1749,11 +1750,11 @@ static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
goto out;
}
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target));
+ sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target), &wait);
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
}
skb_queue_walk_safe(queue, skb, n) {
@@ -1999,19 +2000,19 @@ static int dn_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
* size.
*/
if (dn_queue_too_long(scp, queue, flags)) {
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
if (flags & MSG_DONTWAIT) {
err = -EWOULDBLOCK;
goto out;
}
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
sk_wait_event(sk, &timeo,
- !dn_queue_too_long(scp, queue, flags));
+ !dn_queue_too_long(scp, queue, flags), &wait);
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
continue;
}
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index b2c26b081134..8fdd9f492b0e 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -42,7 +42,7 @@
#include <linux/notifier.h>
#include <linux/slab.h>
#include <linux/jiffies.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/net_namespace.h>
#include <net/neighbour.h>
#include <net/dst.h>
@@ -201,7 +201,7 @@ static struct dn_dev_sysctl_table {
.extra1 = &min_t3,
.extra2 = &max_t3
},
- {0}
+ { }
},
};
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index a796fc7cbc35..7af0ba6157a1 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -31,7 +31,7 @@
#include <linux/timer.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/neighbour.h>
#include <net/dst.h>
#include <net/flow.h>
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 1540b506e3e0..232675480756 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -25,7 +25,7 @@
#include <linux/timer.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/route.h> /* RTF_xxx */
#include <net/neighbour.h>
#include <net/netlink.h>
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index 5325b541c526..6c7da6c29bf0 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -22,7 +22,7 @@
#include <net/dst.h>
#include <net/flow.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/dn.h>
#include <net/dn_dev.h>
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index ecc28cff08ab..af781010753b 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -37,8 +37,10 @@
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/cred.h>
#include <linux/dns_resolver.h>
#include <linux/err.h>
+
#include <keys/dns_resolver-type.h>
#include <keys/user-type.h>
@@ -70,7 +72,7 @@ int dns_query(const char *type, const char *name, size_t namelen,
const char *options, char **_result, time64_t *_expiry)
{
struct key *rkey;
- const struct user_key_payload *upayload;
+ struct user_key_payload *upayload;
const struct cred *saved_cred;
size_t typelen, desclen;
char *desc, *cp;
@@ -141,7 +143,7 @@ int dns_query(const char *type, const char *name, size_t namelen,
if (ret)
goto put;
- upayload = user_key_payload(rkey);
+ upayload = user_key_payload_locked(rkey);
len = upayload->datalen;
ret = -ENOMEM;
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 96e47c539bee..9649238eef40 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -1,12 +1,13 @@
config HAVE_NET_DSA
def_bool y
- depends on NETDEVICES && !S390
+ depends on INET && NETDEVICES && !S390
# Drivers must select NET_DSA and the appropriate tagging format
config NET_DSA
tristate "Distributed Switch Architecture"
- depends on HAVE_NET_DSA && NET_SWITCHDEV
+ depends on HAVE_NET_DSA
+ select NET_SWITCHDEV
select PHYLIB
---help---
Say Y if you want to enable support for the hardware switches supported
@@ -14,17 +15,6 @@ config NET_DSA
if NET_DSA
-config NET_DSA_HWMON
- bool "Distributed Switch Architecture HWMON support"
- default y
- depends on HWMON && !(NET_DSA=y && HWMON=m)
- ---help---
- Say Y if you want to expose thermal sensor data on switches supported
- by the Distributed Switch Architecture.
-
- Some of those switches contain thermal sensors. This data is available
- via the hwmon sysfs interface and exposes the onboard sensors.
-
# tagging formats
config NET_DSA_TAG_BRCM
bool
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index a3380ed0e0be..31d343796251 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,6 +1,6 @@
# the core
obj-$(CONFIG_NET_DSA) += dsa_core.o
-dsa_core-y += dsa.o slave.o dsa2.o
+dsa_core-y += dsa.o slave.o dsa2.o switch.o
# tagging formats
dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 7899919cd9f0..b6d4f6a23f06 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -9,9 +9,7 @@
* (at your option) any later version.
*/
-#include <linux/ctype.h>
#include <linux/device.h>
-#include <linux/hwmon.h>
#include <linux/list.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
@@ -27,8 +25,6 @@
#include <linux/gpio/consumer.h>
#include "dsa_priv.h"
-char dsa_driver_version[] = "0.1";
-
static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
struct net_device *dev)
{
@@ -64,27 +60,27 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
static DEFINE_MUTEX(dsa_switch_drivers_mutex);
static LIST_HEAD(dsa_switch_drivers);
-void register_switch_driver(struct dsa_switch_ops *ops)
+void register_switch_driver(struct dsa_switch_driver *drv)
{
mutex_lock(&dsa_switch_drivers_mutex);
- list_add_tail(&ops->list, &dsa_switch_drivers);
+ list_add_tail(&drv->list, &dsa_switch_drivers);
mutex_unlock(&dsa_switch_drivers_mutex);
}
EXPORT_SYMBOL_GPL(register_switch_driver);
-void unregister_switch_driver(struct dsa_switch_ops *ops)
+void unregister_switch_driver(struct dsa_switch_driver *drv)
{
mutex_lock(&dsa_switch_drivers_mutex);
- list_del_init(&ops->list);
+ list_del_init(&drv->list);
mutex_unlock(&dsa_switch_drivers_mutex);
}
EXPORT_SYMBOL_GPL(unregister_switch_driver);
-static struct dsa_switch_ops *
+static const struct dsa_switch_ops *
dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
const char **_name, void **priv)
{
- struct dsa_switch_ops *ret;
+ const struct dsa_switch_ops *ret;
struct list_head *list;
const char *name;
@@ -93,9 +89,11 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
mutex_lock(&dsa_switch_drivers_mutex);
list_for_each(list, &dsa_switch_drivers) {
- struct dsa_switch_ops *ops;
+ const struct dsa_switch_ops *ops;
+ struct dsa_switch_driver *drv;
- ops = list_entry(list, struct dsa_switch_ops, list);
+ drv = list_entry(list, struct dsa_switch_driver, list);
+ ops = drv->ops;
name = ops->probe(parent, host_dev, sw_addr, priv);
if (name != NULL) {
@@ -110,109 +108,11 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
return ret;
}
-/* hwmon support ************************************************************/
-
-#ifdef CONFIG_NET_DSA_HWMON
-
-static ssize_t temp1_input_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct dsa_switch *ds = dev_get_drvdata(dev);
- int temp, ret;
-
- ret = ds->ops->get_temp(ds, &temp);
- if (ret < 0)
- return ret;
-
- return sprintf(buf, "%d\n", temp * 1000);
-}
-static DEVICE_ATTR_RO(temp1_input);
-
-static ssize_t temp1_max_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct dsa_switch *ds = dev_get_drvdata(dev);
- int temp, ret;
-
- ret = ds->ops->get_temp_limit(ds, &temp);
- if (ret < 0)
- return ret;
-
- return sprintf(buf, "%d\n", temp * 1000);
-}
-
-static ssize_t temp1_max_store(struct device *dev,
- struct device_attribute *attr, const char *buf,
- size_t count)
-{
- struct dsa_switch *ds = dev_get_drvdata(dev);
- int temp, ret;
-
- ret = kstrtoint(buf, 0, &temp);
- if (ret < 0)
- return ret;
-
- ret = ds->ops->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000));
- if (ret < 0)
- return ret;
-
- return count;
-}
-static DEVICE_ATTR_RW(temp1_max);
-
-static ssize_t temp1_max_alarm_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct dsa_switch *ds = dev_get_drvdata(dev);
- bool alarm;
- int ret;
-
- ret = ds->ops->get_temp_alarm(ds, &alarm);
- if (ret < 0)
- return ret;
-
- return sprintf(buf, "%d\n", alarm);
-}
-static DEVICE_ATTR_RO(temp1_max_alarm);
-
-static struct attribute *dsa_hwmon_attrs[] = {
- &dev_attr_temp1_input.attr, /* 0 */
- &dev_attr_temp1_max.attr, /* 1 */
- &dev_attr_temp1_max_alarm.attr, /* 2 */
- NULL
-};
-
-static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj,
- struct attribute *attr, int index)
-{
- struct device *dev = container_of(kobj, struct device, kobj);
- struct dsa_switch *ds = dev_get_drvdata(dev);
- struct dsa_switch_ops *ops = ds->ops;
- umode_t mode = attr->mode;
-
- if (index == 1) {
- if (!ops->get_temp_limit)
- mode = 0;
- else if (!ops->set_temp_limit)
- mode &= ~S_IWUSR;
- } else if (index == 2 && !ops->get_temp_alarm) {
- mode = 0;
- }
- return mode;
-}
-
-static const struct attribute_group dsa_hwmon_group = {
- .attrs = dsa_hwmon_attrs,
- .is_visible = dsa_hwmon_attrs_visible,
-};
-__ATTRIBUTE_GROUPS(dsa_hwmon);
-
-#endif /* CONFIG_NET_DSA_HWMON */
-
/* basic switch operations **************************************************/
int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
- struct device_node *port_dn, int port)
+ struct dsa_port *dport, int port)
{
+ struct device_node *port_dn = dport->dn;
struct phy_device *phydev;
int ret, mode;
@@ -242,15 +142,15 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
static int dsa_cpu_dsa_setups(struct dsa_switch *ds, struct device *dev)
{
- struct device_node *port_dn;
+ struct dsa_port *dport;
int ret, port;
- for (port = 0; port < DSA_MAX_PORTS; port++) {
+ for (port = 0; port < ds->num_ports; port++) {
if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
continue;
- port_dn = ds->ports[port].dn;
- ret = dsa_cpu_dsa_setup(ds, dev, port_dn, port);
+ dport = &ds->ports[port];
+ ret = dsa_cpu_dsa_setup(ds, dev, dport, port);
if (ret)
return ret;
}
@@ -308,7 +208,7 @@ void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds)
static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
{
- struct dsa_switch_ops *ops = ds->ops;
+ const struct dsa_switch_ops *ops = ds->ops;
struct dsa_switch_tree *dst = ds->dst;
struct dsa_chip_data *cd = ds->cd;
bool valid_name_found = false;
@@ -318,7 +218,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
/*
* Validate supplied switch configuration.
*/
- for (i = 0; i < DSA_MAX_PORTS; i++) {
+ for (i = 0; i < ds->num_ports; i++) {
char *name;
name = cd->port_names[i];
@@ -326,13 +226,12 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
continue;
if (!strcmp(name, "cpu")) {
- if (dst->cpu_switch != -1) {
+ if (dst->cpu_switch) {
netdev_err(dst->master_netdev,
"multiple cpu ports?!\n");
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
- dst->cpu_switch = index;
+ dst->cpu_switch = ds;
dst->cpu_port = i;
ds->cpu_port_mask |= 1 << i;
} else if (!strcmp(name, "dsa")) {
@@ -343,10 +242,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
valid_name_found = true;
}
- if (!valid_name_found && i == DSA_MAX_PORTS) {
- ret = -EINVAL;
- goto out;
- }
+ if (!valid_name_found && i == ds->num_ports)
+ return -EINVAL;
/* Make the built-in MII bus mask match the number of ports,
* switch drivers can override this later
@@ -358,15 +255,13 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
* tagging protocol to the preferred tagging format of this
* switch.
*/
- if (dst->cpu_switch == index) {
+ if (dst->cpu_switch == ds) {
enum dsa_tag_protocol tag_protocol;
tag_protocol = ops->get_tag_protocol(ds);
dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol);
- if (IS_ERR(dst->tag_ops)) {
- ret = PTR_ERR(dst->tag_ops);
- goto out;
- }
+ if (IS_ERR(dst->tag_ops))
+ return PTR_ERR(dst->tag_ops);
dst->rcv = dst->tag_ops->rcv;
}
@@ -378,85 +273,55 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
*/
ret = ops->setup(ds);
if (ret < 0)
- goto out;
+ return ret;
+
+ ret = dsa_switch_register_notifier(ds);
+ if (ret)
+ return ret;
if (ops->set_addr) {
ret = ops->set_addr(ds, dst->master_netdev->dev_addr);
if (ret < 0)
- goto out;
+ return ret;
}
if (!ds->slave_mii_bus && ops->phy_read) {
ds->slave_mii_bus = devm_mdiobus_alloc(parent);
- if (!ds->slave_mii_bus) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!ds->slave_mii_bus)
+ return -ENOMEM;
dsa_slave_mii_bus_init(ds);
ret = mdiobus_register(ds->slave_mii_bus);
if (ret < 0)
- goto out;
+ return ret;
}
/*
* Create network devices for physical switch ports.
*/
- for (i = 0; i < DSA_MAX_PORTS; i++) {
+ for (i = 0; i < ds->num_ports; i++) {
ds->ports[i].dn = cd->port_dn[i];
if (!(ds->enabled_port_mask & (1 << i)))
continue;
ret = dsa_slave_create(ds, parent, i, cd->port_names[i]);
- if (ret < 0) {
+ if (ret < 0)
netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n",
index, i, cd->port_names[i], ret);
- ret = 0;
- }
}
/* Perform configuration of the CPU and DSA ports */
ret = dsa_cpu_dsa_setups(ds, parent);
- if (ret < 0) {
+ if (ret < 0)
netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n",
index);
- ret = 0;
- }
ret = dsa_cpu_port_ethtool_setup(ds);
if (ret)
return ret;
-#ifdef CONFIG_NET_DSA_HWMON
- /* If the switch provides a temperature sensor,
- * register with hardware monitoring subsystem.
- * Treat registration error as non-fatal and ignore it.
- */
- if (ops->get_temp) {
- const char *netname = netdev_name(dst->master_netdev);
- char hname[IFNAMSIZ + 1];
- int i, j;
-
- /* Create valid hwmon 'name' attribute */
- for (i = j = 0; i < IFNAMSIZ && netname[i]; i++) {
- if (isalnum(netname[i]))
- hname[j++] = netname[i];
- }
- hname[j] = '\0';
- scnprintf(ds->hwmon_name, sizeof(ds->hwmon_name), "%s_dsa%d",
- hname, index);
- ds->hwmon_dev = hwmon_device_register_with_groups(NULL,
- ds->hwmon_name, ds, dsa_hwmon_groups);
- if (IS_ERR(ds->hwmon_dev))
- ds->hwmon_dev = NULL;
- }
-#endif /* CONFIG_NET_DSA_HWMON */
-
- return ret;
-
-out:
- return ret;
+ return 0;
}
static struct dsa_switch *
@@ -464,7 +329,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
struct device *parent, struct device *host_dev)
{
struct dsa_chip_data *cd = dst->pd->chip + index;
- struct dsa_switch_ops *ops;
+ const struct dsa_switch_ops *ops;
struct dsa_switch *ds;
int ret;
const char *name;
@@ -486,8 +351,8 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
/*
* Allocate and initialise switch state.
*/
- ds = devm_kzalloc(parent, sizeof(*ds), GFP_KERNEL);
- if (ds == NULL)
+ ds = dsa_switch_alloc(parent, DSA_MAX_PORTS);
+ if (!ds)
return ERR_PTR(-ENOMEM);
ds->dst = dst;
@@ -495,7 +360,6 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
ds->cd = cd;
ds->ops = ops;
ds->priv = priv;
- ds->dev = parent;
ret = dsa_switch_setup_one(ds, parent);
if (ret)
@@ -504,8 +368,10 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
return ds;
}
-void dsa_cpu_dsa_destroy(struct device_node *port_dn)
+void dsa_cpu_dsa_destroy(struct dsa_port *port)
{
+ struct device_node *port_dn = port->dn;
+
if (of_phy_is_fixed_link(port_dn))
of_phy_deregister_fixed_link(port_dn);
}
@@ -514,13 +380,8 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
{
int port;
-#ifdef CONFIG_NET_DSA_HWMON
- if (ds->hwmon_dev)
- hwmon_device_unregister(ds->hwmon_dev);
-#endif
-
/* Destroy network devices for physical switch ports. */
- for (port = 0; port < DSA_MAX_PORTS; port++) {
+ for (port = 0; port < ds->num_ports; port++) {
if (!(ds->enabled_port_mask & (1 << port)))
continue;
@@ -531,10 +392,10 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
}
/* Disable configuration of the CPU and DSA ports */
- for (port = 0; port < DSA_MAX_PORTS; port++) {
+ for (port = 0; port < ds->num_ports; port++) {
if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
continue;
- dsa_cpu_dsa_destroy(ds->ports[port].dn);
+ dsa_cpu_dsa_destroy(&ds->ports[port]);
/* Clearing a bit which is not set does no harm */
ds->cpu_port_mask |= ~(1 << port);
@@ -543,6 +404,8 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
if (ds->slave_mii_bus && ds->ops->phy_read)
mdiobus_unregister(ds->slave_mii_bus);
+
+ dsa_switch_unregister_notifier(ds);
}
#ifdef CONFIG_PM_SLEEP
@@ -551,7 +414,7 @@ int dsa_switch_suspend(struct dsa_switch *ds)
int i, ret = 0;
/* Suspend slave network devices */
- for (i = 0; i < DSA_MAX_PORTS; i++) {
+ for (i = 0; i < ds->num_ports; i++) {
if (!dsa_is_port_initialized(ds, i))
continue;
@@ -578,7 +441,7 @@ int dsa_switch_resume(struct dsa_switch *ds)
return ret;
/* Resume slave network devices */
- for (i = 0; i < DSA_MAX_PORTS; i++) {
+ for (i = 0; i < ds->num_ports; i++) {
if (!dsa_is_port_initialized(ds, i))
continue;
@@ -629,7 +492,7 @@ struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev)
}
EXPORT_SYMBOL_GPL(dsa_host_dev_to_mii_bus);
-static struct net_device *dev_to_net_device(struct device *dev)
+struct net_device *dsa_dev_to_net_device(struct device *dev)
{
struct device *d;
@@ -646,6 +509,7 @@ static struct net_device *dev_to_net_device(struct device *dev)
return NULL;
}
+EXPORT_SYMBOL_GPL(dsa_dev_to_net_device);
#ifdef CONFIG_OF
static int dsa_of_setup_routing_table(struct dsa_platform_data *pd,
@@ -898,7 +762,6 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,
dst->pd = pd;
dst->master_netdev = dev;
- dst->cpu_switch = -1;
dst->cpu_port = -1;
for (i = 0; i < pd->nr_chips; i++) {
@@ -940,9 +803,6 @@ static int dsa_probe(struct platform_device *pdev)
struct dsa_switch_tree *dst;
int ret;
- pr_notice_once("Distributed Switch Architecture driver version %s\n",
- dsa_driver_version);
-
if (pdev->dev.of_node) {
ret = dsa_of_probe(&pdev->dev);
if (ret)
@@ -958,7 +818,7 @@ static int dsa_probe(struct platform_device *pdev)
dev = pd->of_netdev;
dev_hold(dev);
} else {
- dev = dev_to_net_device(pd->netdev);
+ dev = dsa_dev_to_net_device(pd->netdev);
}
if (dev == NULL) {
ret = -EPROBE_DEFER;
@@ -1013,7 +873,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)
dsa_switch_destroy(ds);
}
- dsa_cpu_port_ethtool_restore(dst->ds[0]);
+ dsa_cpu_port_ethtool_restore(dst->cpu_switch);
dev_put(dst->master_netdev);
}
@@ -1050,10 +910,6 @@ static struct packet_type dsa_pack_type __read_mostly = {
.func = dsa_switch_rcv,
};
-static struct notifier_block dsa_netdevice_nb __read_mostly = {
- .notifier_call = dsa_slave_netdevice_event,
-};
-
#ifdef CONFIG_PM_SLEEP
static int dsa_suspend(struct device *d)
{
@@ -1111,7 +967,9 @@ static int __init dsa_init_module(void)
{
int rc;
- register_netdevice_notifier(&dsa_netdevice_nb);
+ rc = dsa_slave_register_notifier();
+ if (rc)
+ return rc;
rc = platform_driver_register(&dsa_driver);
if (rc)
@@ -1125,7 +983,7 @@ module_init(dsa_init_module);
static void __exit dsa_cleanup_module(void)
{
- unregister_netdevice_notifier(&dsa_netdevice_nb);
+ dsa_slave_unregister_notifier();
dev_remove_pack(&dsa_pack_type);
platform_driver_unregister(&dsa_driver);
}
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 5fff951a0a49..737be6470c7f 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -57,7 +57,6 @@ static struct dsa_switch_tree *dsa_add_dst(u32 tree)
if (!dst)
return NULL;
dst->tree = tree;
- dst->cpu_switch = -1;
INIT_LIST_HEAD(&dst->list);
list_add_tail(&dsa_switch_trees, &dst->list);
kref_init(&dst->refcount);
@@ -79,47 +78,43 @@ static void dsa_dst_del_ds(struct dsa_switch_tree *dst,
kref_put(&dst->refcount, dsa_free_dst);
}
-static bool dsa_port_is_dsa(struct device_node *port)
+/* For platform data configurations, we need to have a valid name argument to
+ * differentiate a disabled port from an enabled one
+ */
+static bool dsa_port_is_valid(struct dsa_port *port)
{
- const char *name;
-
- name = of_get_property(port, "label", NULL);
- if (!name)
- return false;
+ return !!(port->dn || port->name);
+}
- if (!strcmp(name, "dsa"))
+static bool dsa_port_is_dsa(struct dsa_port *port)
+{
+ if (port->name && !strcmp(port->name, "dsa"))
return true;
-
- return false;
+ else
+ return !!of_parse_phandle(port->dn, "link", 0);
}
-static bool dsa_port_is_cpu(struct device_node *port)
+static bool dsa_port_is_cpu(struct dsa_port *port)
{
- const char *name;
-
- name = of_get_property(port, "label", NULL);
- if (!name)
- return false;
-
- if (!strcmp(name, "cpu"))
+ if (port->name && !strcmp(port->name, "cpu"))
return true;
-
- return false;
+ else
+ return !!of_parse_phandle(port->dn, "ethernet", 0);
}
-static bool dsa_ds_find_port(struct dsa_switch *ds,
- struct device_node *port)
+static bool dsa_ds_find_port_dn(struct dsa_switch *ds,
+ struct device_node *port)
{
u32 index;
- for (index = 0; index < DSA_MAX_PORTS; index++)
+ for (index = 0; index < ds->num_ports; index++)
if (ds->ports[index].dn == port)
return true;
return false;
}
-static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst,
- struct device_node *port)
+static struct dsa_switch *dsa_dst_find_port_dn(struct dsa_switch_tree *dst,
+ struct device_node *port)
{
struct dsa_switch *ds;
u32 index;
@@ -129,7 +124,7 @@ static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst,
if (!ds)
continue;
- if (dsa_ds_find_port(ds, port))
+ if (dsa_ds_find_port_dn(ds, port))
return ds;
}
@@ -138,7 +133,7 @@ static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst,
static int dsa_port_complete(struct dsa_switch_tree *dst,
struct dsa_switch *src_ds,
- struct device_node *port,
+ struct dsa_port *port,
u32 src_port)
{
struct device_node *link;
@@ -146,11 +141,11 @@ static int dsa_port_complete(struct dsa_switch_tree *dst,
struct dsa_switch *dst_ds;
for (index = 0;; index++) {
- link = of_parse_phandle(port, "link", index);
+ link = of_parse_phandle(port->dn, "link", index);
if (!link)
break;
- dst_ds = dsa_dst_find_port(dst, link);
+ dst_ds = dsa_dst_find_port_dn(dst, link);
of_node_put(link);
if (!dst_ds)
@@ -169,13 +164,13 @@ static int dsa_port_complete(struct dsa_switch_tree *dst,
*/
static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds)
{
- struct device_node *port;
+ struct dsa_port *port;
u32 index;
int err;
- for (index = 0; index < DSA_MAX_PORTS; index++) {
- port = ds->ports[index].dn;
- if (!port)
+ for (index = 0; index < ds->num_ports; index++) {
+ port = &ds->ports[index];
+ if (!dsa_port_is_valid(port))
continue;
if (!dsa_port_is_dsa(port))
@@ -215,7 +210,7 @@ static int dsa_dst_complete(struct dsa_switch_tree *dst)
return 0;
}
-static int dsa_dsa_port_apply(struct device_node *port, u32 index,
+static int dsa_dsa_port_apply(struct dsa_port *port, u32 index,
struct dsa_switch *ds)
{
int err;
@@ -230,13 +225,13 @@ static int dsa_dsa_port_apply(struct device_node *port, u32 index,
return 0;
}
-static void dsa_dsa_port_unapply(struct device_node *port, u32 index,
+static void dsa_dsa_port_unapply(struct dsa_port *port, u32 index,
struct dsa_switch *ds)
{
dsa_cpu_dsa_destroy(port);
}
-static int dsa_cpu_port_apply(struct device_node *port, u32 index,
+static int dsa_cpu_port_apply(struct dsa_port *port, u32 index,
struct dsa_switch *ds)
{
int err;
@@ -253,7 +248,7 @@ static int dsa_cpu_port_apply(struct device_node *port, u32 index,
return 0;
}
-static void dsa_cpu_port_unapply(struct device_node *port, u32 index,
+static void dsa_cpu_port_unapply(struct dsa_port *port, u32 index,
struct dsa_switch *ds)
{
dsa_cpu_dsa_destroy(port);
@@ -261,25 +256,29 @@ static void dsa_cpu_port_unapply(struct device_node *port, u32 index,
}
-static int dsa_user_port_apply(struct device_node *port, u32 index,
+static int dsa_user_port_apply(struct dsa_port *port, u32 index,
struct dsa_switch *ds)
{
- const char *name;
+ const char *name = port->name;
int err;
- name = of_get_property(port, "label", NULL);
+ if (port->dn)
+ name = of_get_property(port->dn, "label", NULL);
+ if (!name)
+ name = "eth%d";
err = dsa_slave_create(ds, ds->dev, index, name);
if (err) {
dev_warn(ds->dev, "Failed to create slave %d: %d\n",
index, err);
+ ds->ports[index].netdev = NULL;
return err;
}
return 0;
}
-static void dsa_user_port_unapply(struct device_node *port, u32 index,
+static void dsa_user_port_unapply(struct dsa_port *port, u32 index,
struct dsa_switch *ds)
{
if (ds->ports[index].netdev) {
@@ -291,7 +290,7 @@ static void dsa_user_port_unapply(struct device_node *port, u32 index,
static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
{
- struct device_node *port;
+ struct dsa_port *port;
u32 index;
int err;
@@ -306,6 +305,10 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
if (err < 0)
return err;
+ err = dsa_switch_register_notifier(ds);
+ if (err)
+ return err;
+
if (ds->ops->set_addr) {
err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr);
if (err < 0)
@@ -324,9 +327,9 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
return err;
}
- for (index = 0; index < DSA_MAX_PORTS; index++) {
- port = ds->ports[index].dn;
- if (!port)
+ for (index = 0; index < ds->num_ports; index++) {
+ port = &ds->ports[index];
+ if (!dsa_port_is_valid(port))
continue;
if (dsa_port_is_dsa(port)) {
@@ -353,12 +356,12 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
{
- struct device_node *port;
+ struct dsa_port *port;
u32 index;
- for (index = 0; index < DSA_MAX_PORTS; index++) {
- port = ds->ports[index].dn;
- if (!port)
+ for (index = 0; index < ds->num_ports; index++) {
+ port = &ds->ports[index];
+ if (!dsa_port_is_valid(port))
continue;
if (dsa_port_is_dsa(port)) {
@@ -376,6 +379,8 @@ static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
if (ds->slave_mii_bus && ds->ops->phy_read)
mdiobus_unregister(ds->slave_mii_bus);
+
+ dsa_switch_unregister_notifier(ds);
}
static int dsa_dst_apply(struct dsa_switch_tree *dst)
@@ -394,9 +399,11 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst)
return err;
}
- err = dsa_cpu_port_ethtool_setup(dst->ds[0]);
- if (err)
- return err;
+ if (dst->cpu_switch) {
+ err = dsa_cpu_port_ethtool_setup(dst->cpu_switch);
+ if (err)
+ return err;
+ }
/* If we use a tagging format that doesn't have an ethertype
* field, make sure that all packets from this point on get
@@ -433,13 +440,14 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst)
dsa_ds_unapply(dst, ds);
}
- dsa_cpu_port_ethtool_restore(dst->ds[0]);
+ if (dst->cpu_switch)
+ dsa_cpu_port_ethtool_restore(dst->cpu_switch);
pr_info("DSA: tree %d unapplied\n", dst->tree);
dst->applied = false;
}
-static int dsa_cpu_parse(struct device_node *port, u32 index,
+static int dsa_cpu_parse(struct dsa_port *port, u32 index,
struct dsa_switch_tree *dst,
struct dsa_switch *ds)
{
@@ -447,11 +455,16 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
struct net_device *ethernet_dev;
struct device_node *ethernet;
- ethernet = of_parse_phandle(port, "ethernet", 0);
- if (!ethernet)
- return -EINVAL;
+ if (port->dn) {
+ ethernet = of_parse_phandle(port->dn, "ethernet", 0);
+ if (!ethernet)
+ return -EINVAL;
+ ethernet_dev = of_find_net_device_by_node(ethernet);
+ } else {
+ ethernet_dev = dsa_dev_to_net_device(ds->cd->netdev[index]);
+ dev_put(ethernet_dev);
+ }
- ethernet_dev = of_find_net_device_by_node(ethernet);
if (!ethernet_dev)
return -EPROBE_DEFER;
@@ -461,8 +474,8 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
if (!dst->master_netdev)
dst->master_netdev = ethernet_dev;
- if (dst->cpu_switch == -1) {
- dst->cpu_switch = ds->index;
+ if (!dst->cpu_switch) {
+ dst->cpu_switch = ds;
dst->cpu_port = index;
}
@@ -480,13 +493,13 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds)
{
- struct device_node *port;
+ struct dsa_port *port;
u32 index;
int err;
- for (index = 0; index < DSA_MAX_PORTS; index++) {
- port = ds->ports[index].dn;
- if (!port)
+ for (index = 0; index < ds->num_ports; index++) {
+ port = &ds->ports[index];
+ if (!dsa_port_is_valid(port))
continue;
if (dsa_port_is_cpu(port)) {
@@ -538,7 +551,7 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds)
if (err)
return err;
- if (reg >= DSA_MAX_PORTS)
+ if (reg >= ds->num_ports)
return -EINVAL;
ds->ports[reg].dn = port;
@@ -547,14 +560,41 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds)
* to have access to a correct value, just like what
* net/dsa/dsa.c::dsa_switch_setup_one does.
*/
- if (!dsa_port_is_cpu(port))
+ if (!dsa_port_is_cpu(&ds->ports[reg]))
ds->enabled_port_mask |= 1 << reg;
}
return 0;
}
-static int dsa_parse_member(struct device_node *np, u32 *tree, u32 *index)
+static int dsa_parse_ports(struct dsa_chip_data *cd, struct dsa_switch *ds)
+{
+ bool valid_name_found = false;
+ unsigned int i;
+
+ for (i = 0; i < DSA_MAX_PORTS; i++) {
+ if (!cd->port_names[i])
+ continue;
+
+ ds->ports[i].name = cd->port_names[i];
+
+ /* Initialize enabled_port_mask now for drv->setup()
+ * to have access to a correct value, just like what
+ * net/dsa/dsa.c::dsa_switch_setup_one does.
+ */
+ if (!dsa_port_is_cpu(&ds->ports[i]))
+ ds->enabled_port_mask |= 1 << i;
+
+ valid_name_found = true;
+ }
+
+ if (!valid_name_found && i == DSA_MAX_PORTS)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int dsa_parse_member_dn(struct device_node *np, u32 *tree, u32 *index)
{
int err;
@@ -578,6 +618,18 @@ static int dsa_parse_member(struct device_node *np, u32 *tree, u32 *index)
return 0;
}
+static int dsa_parse_member(struct dsa_chip_data *pd, u32 *tree, u32 *index)
+{
+ if (!pd)
+ return -ENODEV;
+
+ /* We do not support complex trees with dsa_chip_data */
+ *tree = 0;
+ *index = 0;
+
+ return 0;
+}
+
static struct device_node *dsa_get_ports(struct dsa_switch *ds,
struct device_node *np)
{
@@ -592,23 +644,36 @@ static struct device_node *dsa_get_ports(struct dsa_switch *ds,
return ports;
}
-static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
+static int _dsa_register_switch(struct dsa_switch *ds, struct device *dev)
{
- struct device_node *ports = dsa_get_ports(ds, np);
+ struct dsa_chip_data *pdata = dev->platform_data;
+ struct device_node *np = dev->of_node;
struct dsa_switch_tree *dst;
+ struct device_node *ports;
u32 tree, index;
int i, err;
- err = dsa_parse_member(np, &tree, &index);
- if (err)
- return err;
+ if (np) {
+ err = dsa_parse_member_dn(np, &tree, &index);
+ if (err)
+ return err;
- if (IS_ERR(ports))
- return PTR_ERR(ports);
+ ports = dsa_get_ports(ds, np);
+ if (IS_ERR(ports))
+ return PTR_ERR(ports);
- err = dsa_parse_ports_dn(ports, ds);
- if (err)
- return err;
+ err = dsa_parse_ports_dn(ports, ds);
+ if (err)
+ return err;
+ } else {
+ err = dsa_parse_member(pdata, &tree, &index);
+ if (err)
+ return err;
+
+ err = dsa_parse_ports(pdata, ds);
+ if (err)
+ return err;
+ }
dst = dsa_get_dst(tree);
if (!dst) {
@@ -624,6 +689,7 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
ds->dst = dst;
ds->index = index;
+ ds->cd = pdata;
/* Initialize the routing table */
for (i = 0; i < DSA_MAX_SWITCHES; ++i)
@@ -647,8 +713,14 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
}
err = dsa_dst_parse(dst);
- if (err)
+ if (err) {
+ if (err == -EPROBE_DEFER) {
+ dsa_dst_del_ds(dst, ds, ds->index);
+ return err;
+ }
+
goto out_del_dst;
+ }
err = dsa_dst_apply(dst);
if (err) {
@@ -667,12 +739,34 @@ out:
return err;
}
-int dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
+struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
+{
+ size_t size = sizeof(struct dsa_switch) + n * sizeof(struct dsa_port);
+ struct dsa_switch *ds;
+ int i;
+
+ ds = devm_kzalloc(dev, size, GFP_KERNEL);
+ if (!ds)
+ return NULL;
+
+ ds->dev = dev;
+ ds->num_ports = n;
+
+ for (i = 0; i < ds->num_ports; ++i) {
+ ds->ports[i].index = i;
+ ds->ports[i].ds = ds;
+ }
+
+ return ds;
+}
+EXPORT_SYMBOL_GPL(dsa_switch_alloc);
+
+int dsa_register_switch(struct dsa_switch *ds, struct device *dev)
{
int err;
mutex_lock(&dsa2_mutex);
- err = _dsa_register_switch(ds, np);
+ err = _dsa_register_switch(ds, dev);
mutex_unlock(&dsa2_mutex);
return err;
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 6cfd7388834e..0706a511244e 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -25,12 +25,8 @@ struct dsa_slave_priv {
struct sk_buff * (*xmit)(struct sk_buff *skb,
struct net_device *dev);
- /*
- * Which switch this port is a part of, and the port index
- * for this port.
- */
- struct dsa_switch *parent;
- u8 port;
+ /* DSA port data, such as switch, port index, etc. */
+ struct dsa_port *dp;
/*
* The phylib phy_device pointer for the PHY connected
@@ -42,17 +38,18 @@ struct dsa_slave_priv {
int old_pause;
int old_duplex;
- struct net_device *bridge_dev;
#ifdef CONFIG_NET_POLL_CONTROLLER
struct netpoll *netpoll;
#endif
+
+ /* TC context */
+ struct list_head mall_tc_list;
};
/* dsa.c */
-extern char dsa_driver_version[];
int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
- struct device_node *port_dn, int port);
-void dsa_cpu_dsa_destroy(struct device_node *port_dn);
+ struct dsa_port *dport, int port);
+void dsa_cpu_dsa_destroy(struct dsa_port *dport);
const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds);
void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds);
@@ -66,8 +63,12 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
void dsa_slave_destroy(struct net_device *slave_dev);
int dsa_slave_suspend(struct net_device *slave_dev);
int dsa_slave_resume(struct net_device *slave_dev);
-int dsa_slave_netdevice_event(struct notifier_block *unused,
- unsigned long event, void *ptr);
+int dsa_slave_register_notifier(void);
+void dsa_slave_unregister_notifier(void);
+
+/* switch.c */
+int dsa_switch_register_notifier(struct dsa_switch *ds);
+void dsa_switch_unregister_notifier(struct dsa_switch *ds);
/* tag_dsa.c */
extern const struct dsa_device_ops dsa_netdev_ops;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 30e2e21d7619..c34872e1febc 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -16,12 +16,28 @@
#include <linux/of_net.h>
#include <linux/of_mdio.h>
#include <linux/mdio.h>
+#include <linux/list.h>
#include <net/rtnetlink.h>
#include <net/switchdev.h>
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_mirred.h>
#include <linux/if_bridge.h>
#include <linux/netpoll.h>
#include "dsa_priv.h"
+static bool dsa_slave_dev_check(struct net_device *dev);
+
+static int dsa_slave_notify(struct net_device *dev, unsigned long e, void *v)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct raw_notifier_head *nh = &p->dp->ds->dst->nh;
+ int err;
+
+ err = raw_notifier_call_chain(nh, e, v);
+
+ return notifier_to_errno(err);
+}
+
/* slave mii_bus handling ***************************************************/
static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
{
@@ -61,17 +77,20 @@ static int dsa_slave_get_iflink(const struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- return p->parent->dst->master_netdev->ifindex;
+ return p->dp->ds->dst->master_netdev->ifindex;
}
-static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p)
+static inline bool dsa_port_is_bridged(struct dsa_port *dp)
{
- return !!p->bridge_dev;
+ return !!dp->bridge_dev;
}
-static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state)
+static void dsa_slave_set_state(struct net_device *dev, u8 state)
{
- struct dsa_port *dp = &ds->ports[port];
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_port *dp = p->dp;
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
if (ds->ops->port_stp_state_set)
ds->ops->port_stp_state_set(ds, port, state);
@@ -96,9 +115,9 @@ static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state)
static int dsa_slave_open(struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct net_device *master = p->parent->dst->master_netdev;
- struct dsa_switch *ds = p->parent;
- u8 stp_state = dsa_port_is_bridged(p) ?
+ struct net_device *master = p->dp->ds->dst->master_netdev;
+ struct dsa_switch *ds = p->dp->ds;
+ u8 stp_state = dsa_port_is_bridged(p->dp) ?
BR_STATE_BLOCKING : BR_STATE_FORWARDING;
int err;
@@ -123,12 +142,12 @@ static int dsa_slave_open(struct net_device *dev)
}
if (ds->ops->port_enable) {
- err = ds->ops->port_enable(ds, p->port, p->phy);
+ err = ds->ops->port_enable(ds, p->dp->index, p->phy);
if (err)
goto clear_promisc;
}
- dsa_port_set_stp_state(ds, p->port, stp_state);
+ dsa_slave_set_state(dev, stp_state);
if (p->phy)
phy_start(p->phy);
@@ -151,8 +170,8 @@ out:
static int dsa_slave_close(struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct net_device *master = p->parent->dst->master_netdev;
- struct dsa_switch *ds = p->parent;
+ struct net_device *master = p->dp->ds->dst->master_netdev;
+ struct dsa_switch *ds = p->dp->ds;
if (p->phy)
phy_stop(p->phy);
@@ -168,9 +187,9 @@ static int dsa_slave_close(struct net_device *dev)
dev_uc_del(master, dev->dev_addr);
if (ds->ops->port_disable)
- ds->ops->port_disable(ds, p->port, p->phy);
+ ds->ops->port_disable(ds, p->dp->index, p->phy);
- dsa_port_set_stp_state(ds, p->port, BR_STATE_DISABLED);
+ dsa_slave_set_state(dev, BR_STATE_DISABLED);
return 0;
}
@@ -178,7 +197,7 @@ static int dsa_slave_close(struct net_device *dev)
static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct net_device *master = p->parent->dst->master_netdev;
+ struct net_device *master = p->dp->ds->dst->master_netdev;
if (change & IFF_ALLMULTI)
dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1);
@@ -189,7 +208,7 @@ static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
static void dsa_slave_set_rx_mode(struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct net_device *master = p->parent->dst->master_netdev;
+ struct net_device *master = p->dp->ds->dst->master_netdev;
dev_mc_sync(master, dev);
dev_uc_sync(master, dev);
@@ -198,7 +217,7 @@ static void dsa_slave_set_rx_mode(struct net_device *dev)
static int dsa_slave_set_mac_address(struct net_device *dev, void *a)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct net_device *master = p->parent->dst->master_netdev;
+ struct net_device *master = p->dp->ds->dst->master_netdev;
struct sockaddr *addr = a;
int err;
@@ -228,16 +247,17 @@ static int dsa_slave_port_vlan_add(struct net_device *dev,
struct switchdev_trans *trans)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_port *dp = p->dp;
+ struct dsa_switch *ds = dp->ds;
if (switchdev_trans_ph_prepare(trans)) {
if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add)
return -EOPNOTSUPP;
- return ds->ops->port_vlan_prepare(ds, p->port, vlan, trans);
+ return ds->ops->port_vlan_prepare(ds, dp->index, vlan, trans);
}
- ds->ops->port_vlan_add(ds, p->port, vlan, trans);
+ ds->ops->port_vlan_add(ds, dp->index, vlan, trans);
return 0;
}
@@ -246,12 +266,12 @@ static int dsa_slave_port_vlan_del(struct net_device *dev,
const struct switchdev_obj_port_vlan *vlan)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
if (!ds->ops->port_vlan_del)
return -EOPNOTSUPP;
- return ds->ops->port_vlan_del(ds, p->port, vlan);
+ return ds->ops->port_vlan_del(ds, p->dp->index, vlan);
}
static int dsa_slave_port_vlan_dump(struct net_device *dev,
@@ -259,10 +279,10 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev,
switchdev_obj_dump_cb_t *cb)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
if (ds->ops->port_vlan_dump)
- return ds->ops->port_vlan_dump(ds, p->port, vlan, cb);
+ return ds->ops->port_vlan_dump(ds, p->dp->index, vlan, cb);
return -EOPNOTSUPP;
}
@@ -272,16 +292,16 @@ static int dsa_slave_port_fdb_add(struct net_device *dev,
struct switchdev_trans *trans)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
if (switchdev_trans_ph_prepare(trans)) {
if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add)
return -EOPNOTSUPP;
- return ds->ops->port_fdb_prepare(ds, p->port, fdb, trans);
+ return ds->ops->port_fdb_prepare(ds, p->dp->index, fdb, trans);
}
- ds->ops->port_fdb_add(ds, p->port, fdb, trans);
+ ds->ops->port_fdb_add(ds, p->dp->index, fdb, trans);
return 0;
}
@@ -290,11 +310,11 @@ static int dsa_slave_port_fdb_del(struct net_device *dev,
const struct switchdev_obj_port_fdb *fdb)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
int ret = -EOPNOTSUPP;
if (ds->ops->port_fdb_del)
- ret = ds->ops->port_fdb_del(ds, p->port, fdb);
+ ret = ds->ops->port_fdb_del(ds, p->dp->index, fdb);
return ret;
}
@@ -304,10 +324,10 @@ static int dsa_slave_port_fdb_dump(struct net_device *dev,
switchdev_obj_dump_cb_t *cb)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
if (ds->ops->port_fdb_dump)
- return ds->ops->port_fdb_dump(ds, p->port, fdb, cb);
+ return ds->ops->port_fdb_dump(ds, p->dp->index, fdb, cb);
return -EOPNOTSUPP;
}
@@ -317,16 +337,16 @@ static int dsa_slave_port_mdb_add(struct net_device *dev,
struct switchdev_trans *trans)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
if (switchdev_trans_ph_prepare(trans)) {
if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add)
return -EOPNOTSUPP;
- return ds->ops->port_mdb_prepare(ds, p->port, mdb, trans);
+ return ds->ops->port_mdb_prepare(ds, p->dp->index, mdb, trans);
}
- ds->ops->port_mdb_add(ds, p->port, mdb, trans);
+ ds->ops->port_mdb_add(ds, p->dp->index, mdb, trans);
return 0;
}
@@ -335,10 +355,10 @@ static int dsa_slave_port_mdb_del(struct net_device *dev,
const struct switchdev_obj_port_mdb *mdb)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
if (ds->ops->port_mdb_del)
- return ds->ops->port_mdb_del(ds, p->port, mdb);
+ return ds->ops->port_mdb_del(ds, p->dp->index, mdb);
return -EOPNOTSUPP;
}
@@ -348,10 +368,10 @@ static int dsa_slave_port_mdb_dump(struct net_device *dev,
switchdev_obj_dump_cb_t *cb)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
if (ds->ops->port_mdb_dump)
- return ds->ops->port_mdb_dump(ds, p->port, mdb, cb);
+ return ds->ops->port_mdb_dump(ds, p->dp->index, mdb, cb);
return -EOPNOTSUPP;
}
@@ -371,12 +391,12 @@ static int dsa_slave_stp_state_set(struct net_device *dev,
struct switchdev_trans *trans)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
if (switchdev_trans_ph_prepare(trans))
return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP;
- dsa_port_set_stp_state(ds, p->port, attr->u.stp_state);
+ dsa_slave_set_state(dev, attr->u.stp_state);
return 0;
}
@@ -386,14 +406,14 @@ static int dsa_slave_vlan_filtering(struct net_device *dev,
struct switchdev_trans *trans)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
/* bridge skips -EOPNOTSUPP, so skip the prepare phase */
if (switchdev_trans_ph_prepare(trans))
return 0;
if (ds->ops->port_vlan_filtering)
- return ds->ops->port_vlan_filtering(ds, p->port,
+ return ds->ops->port_vlan_filtering(ds, p->dp->index,
attr->u.vlan_filtering);
return 0;
@@ -404,7 +424,7 @@ static int dsa_fastest_ageing_time(struct dsa_switch *ds,
{
int i;
- for (i = 0; i < DSA_MAX_PORTS; ++i) {
+ for (i = 0; i < ds->num_ports; ++i) {
struct dsa_port *dp = &ds->ports[i];
if (dp && dp->ageing_time && dp->ageing_time < ageing_time)
@@ -419,7 +439,7 @@ static int dsa_slave_ageing_time(struct net_device *dev,
struct switchdev_trans *trans)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
unsigned long ageing_jiffies = clock_t_to_jiffies(attr->u.ageing_time);
unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies);
@@ -428,7 +448,7 @@ static int dsa_slave_ageing_time(struct net_device *dev,
return 0;
/* Keep the fastest ageing time in case of multiple bridges */
- ds->ports[p->port].ageing_time = ageing_time;
+ p->dp->ageing_time = ageing_time;
ageing_time = dsa_fastest_ageing_time(ds, ageing_time);
if (ds->ops->set_ageing_time)
@@ -553,39 +573,58 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
struct net_device *br)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
- int ret = -EOPNOTSUPP;
+ struct dsa_notifier_bridge_info info = {
+ .sw_index = p->dp->ds->index,
+ .port = p->dp->index,
+ .br = br,
+ };
+ int err;
+
+ /* Here the port is already bridged. Reflect the current configuration
+ * so that drivers can program their chips accordingly.
+ */
+ p->dp->bridge_dev = br;
- p->bridge_dev = br;
+ err = dsa_slave_notify(dev, DSA_NOTIFIER_BRIDGE_JOIN, &info);
- if (ds->ops->port_bridge_join)
- ret = ds->ops->port_bridge_join(ds, p->port, br);
+ /* The bridging is rolled back on error */
+ if (err)
+ p->dp->bridge_dev = NULL;
- return ret == -EOPNOTSUPP ? 0 : ret;
+ return err;
}
-static void dsa_slave_bridge_port_leave(struct net_device *dev)
+static void dsa_slave_bridge_port_leave(struct net_device *dev,
+ struct net_device *br)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
-
+ struct dsa_notifier_bridge_info info = {
+ .sw_index = p->dp->ds->index,
+ .port = p->dp->index,
+ .br = br,
+ };
+ int err;
- if (ds->ops->port_bridge_leave)
- ds->ops->port_bridge_leave(ds, p->port);
+ /* Here the port is already unbridged. Reflect the current configuration
+ * so that drivers can program their chips accordingly.
+ */
+ p->dp->bridge_dev = NULL;
- p->bridge_dev = NULL;
+ err = dsa_slave_notify(dev, DSA_NOTIFIER_BRIDGE_LEAVE, &info);
+ if (err)
+ netdev_err(dev, "failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n");
/* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
* so allow it to be in BR_STATE_FORWARDING to be kept functional
*/
- dsa_port_set_stp_state(ds, p->port, BR_STATE_FORWARDING);
+ dsa_slave_set_state(dev, BR_STATE_FORWARDING);
}
static int dsa_slave_port_attr_get(struct net_device *dev,
struct switchdev_attr *attr)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
switch (attr->id) {
case SWITCHDEV_ATTR_ID_PORT_PARENT_ID:
@@ -633,7 +672,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
/* Queue the SKB for transmission on the parent interface, but
* do not modify its EtherType
*/
- nskb->dev = p->parent->dst->master_netdev;
+ nskb->dev = p->dp->ds->dst->master_netdev;
dev_queue_xmit(nskb);
return NETDEV_TX_OK;
@@ -641,28 +680,26 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
/* ethtool operations *******************************************************/
static int
-dsa_slave_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+dsa_slave_get_link_ksettings(struct net_device *dev,
+ struct ethtool_link_ksettings *cmd)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- int err;
+ int err = -EOPNOTSUPP;
- err = -EOPNOTSUPP;
- if (p->phy != NULL) {
- err = phy_read_status(p->phy);
- if (err == 0)
- err = phy_ethtool_gset(p->phy, cmd);
- }
+ if (p->phy != NULL)
+ err = phy_ethtool_ksettings_get(p->phy, cmd);
return err;
}
static int
-dsa_slave_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+dsa_slave_set_link_ksettings(struct net_device *dev,
+ const struct ethtool_link_ksettings *cmd)
{
struct dsa_slave_priv *p = netdev_priv(dev);
if (p->phy != NULL)
- return phy_ethtool_sset(p->phy, cmd);
+ return phy_ethtool_ksettings_set(p->phy, cmd);
return -EOPNOTSUPP;
}
@@ -671,7 +708,6 @@ static void dsa_slave_get_drvinfo(struct net_device *dev,
struct ethtool_drvinfo *drvinfo)
{
strlcpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver));
- strlcpy(drvinfo->version, dsa_driver_version, sizeof(drvinfo->version));
strlcpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version));
strlcpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info));
}
@@ -679,10 +715,10 @@ static void dsa_slave_get_drvinfo(struct net_device *dev,
static int dsa_slave_get_regs_len(struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
if (ds->ops->get_regs_len)
- return ds->ops->get_regs_len(ds, p->port);
+ return ds->ops->get_regs_len(ds, p->dp->index);
return -EOPNOTSUPP;
}
@@ -691,10 +727,10 @@ static void
dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
if (ds->ops->get_regs)
- ds->ops->get_regs(ds, p->port, regs, _p);
+ ds->ops->get_regs(ds, p->dp->index, regs, _p);
}
static int dsa_slave_nway_reset(struct net_device *dev)
@@ -722,7 +758,7 @@ static u32 dsa_slave_get_link(struct net_device *dev)
static int dsa_slave_get_eeprom_len(struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
if (ds->cd && ds->cd->eeprom_len)
return ds->cd->eeprom_len;
@@ -737,7 +773,7 @@ static int dsa_slave_get_eeprom(struct net_device *dev,
struct ethtool_eeprom *eeprom, u8 *data)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
if (ds->ops->get_eeprom)
return ds->ops->get_eeprom(ds, eeprom, data);
@@ -749,7 +785,7 @@ static int dsa_slave_set_eeprom(struct net_device *dev,
struct ethtool_eeprom *eeprom, u8 *data)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
if (ds->ops->set_eeprom)
return ds->ops->set_eeprom(ds, eeprom, data);
@@ -761,7 +797,7 @@ static void dsa_slave_get_strings(struct net_device *dev,
uint32_t stringset, uint8_t *data)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
if (stringset == ETH_SS_STATS) {
int len = ETH_GSTRING_LEN;
@@ -771,7 +807,7 @@ static void dsa_slave_get_strings(struct net_device *dev,
strncpy(data + 2 * len, "rx_packets", len);
strncpy(data + 3 * len, "rx_bytes", len);
if (ds->ops->get_strings)
- ds->ops->get_strings(ds, p->port, data + 4 * len);
+ ds->ops->get_strings(ds, p->dp->index, data + 4 * len);
}
}
@@ -780,7 +816,7 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev,
uint64_t *data)
{
struct dsa_switch_tree *dst = dev->dsa_ptr;
- struct dsa_switch *ds = dst->ds[0];
+ struct dsa_switch *ds = dst->cpu_switch;
s8 cpu_port = dst->cpu_port;
int count = 0;
@@ -797,7 +833,7 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev,
static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset)
{
struct dsa_switch_tree *dst = dev->dsa_ptr;
- struct dsa_switch *ds = dst->ds[0];
+ struct dsa_switch *ds = dst->cpu_switch;
int count = 0;
if (dst->master_ethtool_ops.get_sset_count)
@@ -813,7 +849,7 @@ static void dsa_cpu_port_get_strings(struct net_device *dev,
uint32_t stringset, uint8_t *data)
{
struct dsa_switch_tree *dst = dev->dsa_ptr;
- struct dsa_switch *ds = dst->ds[0];
+ struct dsa_switch *ds = dst->cpu_switch;
s8 cpu_port = dst->cpu_port;
int len = ETH_GSTRING_LEN;
int mcount = 0, count;
@@ -852,20 +888,20 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev,
uint64_t *data)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
data[0] = dev->stats.tx_packets;
data[1] = dev->stats.tx_bytes;
data[2] = dev->stats.rx_packets;
data[3] = dev->stats.rx_bytes;
if (ds->ops->get_ethtool_stats)
- ds->ops->get_ethtool_stats(ds, p->port, data + 4);
+ ds->ops->get_ethtool_stats(ds, p->dp->index, data + 4);
}
static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
if (sset == ETH_SS_STATS) {
int count;
@@ -883,20 +919,20 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
if (ds->ops->get_wol)
- ds->ops->get_wol(ds, p->port, w);
+ ds->ops->get_wol(ds, p->dp->index, w);
}
static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
int ret = -EOPNOTSUPP;
if (ds->ops->set_wol)
- ret = ds->ops->set_wol(ds, p->port, w);
+ ret = ds->ops->set_wol(ds, p->dp->index, w);
return ret;
}
@@ -904,13 +940,13 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
int ret;
if (!ds->ops->set_eee)
return -EOPNOTSUPP;
- ret = ds->ops->set_eee(ds, p->port, p->phy, e);
+ ret = ds->ops->set_eee(ds, p->dp->index, p->phy, e);
if (ret)
return ret;
@@ -923,13 +959,13 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
int ret;
if (!ds->ops->get_eee)
return -EOPNOTSUPP;
- ret = ds->ops->get_eee(ds, p->port, e);
+ ret = ds->ops->get_eee(ds, p->dp->index, e);
if (ret)
return ret;
@@ -944,7 +980,7 @@ static int dsa_slave_netpoll_setup(struct net_device *dev,
struct netpoll_info *ni)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
struct net_device *master = ds->dst->master_netdev;
struct netpoll *netpoll;
int err = 0;
@@ -982,6 +1018,144 @@ static void dsa_slave_poll_controller(struct net_device *dev)
}
#endif
+static int dsa_slave_get_phys_port_name(struct net_device *dev,
+ char *name, size_t len)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+
+ if (snprintf(name, len, "p%d", p->dp->index) >= len)
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct dsa_mall_tc_entry *
+dsa_slave_mall_tc_entry_find(struct dsa_slave_priv *p,
+ unsigned long cookie)
+{
+ struct dsa_mall_tc_entry *mall_tc_entry;
+
+ list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list)
+ if (mall_tc_entry->cookie == cookie)
+ return mall_tc_entry;
+
+ return NULL;
+}
+
+static int dsa_slave_add_cls_matchall(struct net_device *dev,
+ __be16 protocol,
+ struct tc_cls_matchall_offload *cls,
+ bool ingress)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_mall_tc_entry *mall_tc_entry;
+ struct dsa_switch *ds = p->dp->ds;
+ struct net *net = dev_net(dev);
+ struct dsa_slave_priv *to_p;
+ struct net_device *to_dev;
+ const struct tc_action *a;
+ int err = -EOPNOTSUPP;
+ LIST_HEAD(actions);
+ int ifindex;
+
+ if (!ds->ops->port_mirror_add)
+ return err;
+
+ if (!tc_single_action(cls->exts))
+ return err;
+
+ tcf_exts_to_list(cls->exts, &actions);
+ a = list_first_entry(&actions, struct tc_action, list);
+
+ if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) {
+ struct dsa_mall_mirror_tc_entry *mirror;
+
+ ifindex = tcf_mirred_ifindex(a);
+ to_dev = __dev_get_by_index(net, ifindex);
+ if (!to_dev)
+ return -EINVAL;
+
+ if (!dsa_slave_dev_check(to_dev))
+ return -EOPNOTSUPP;
+
+ mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
+ if (!mall_tc_entry)
+ return -ENOMEM;
+
+ mall_tc_entry->cookie = cls->cookie;
+ mall_tc_entry->type = DSA_PORT_MALL_MIRROR;
+ mirror = &mall_tc_entry->mirror;
+
+ to_p = netdev_priv(to_dev);
+
+ mirror->to_local_port = to_p->dp->index;
+ mirror->ingress = ingress;
+
+ err = ds->ops->port_mirror_add(ds, p->dp->index, mirror,
+ ingress);
+ if (err) {
+ kfree(mall_tc_entry);
+ return err;
+ }
+
+ list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
+ }
+
+ return 0;
+}
+
+static void dsa_slave_del_cls_matchall(struct net_device *dev,
+ struct tc_cls_matchall_offload *cls)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_mall_tc_entry *mall_tc_entry;
+ struct dsa_switch *ds = p->dp->ds;
+
+ if (!ds->ops->port_mirror_del)
+ return;
+
+ mall_tc_entry = dsa_slave_mall_tc_entry_find(p, cls->cookie);
+ if (!mall_tc_entry)
+ return;
+
+ list_del(&mall_tc_entry->list);
+
+ switch (mall_tc_entry->type) {
+ case DSA_PORT_MALL_MIRROR:
+ ds->ops->port_mirror_del(ds, p->dp->index,
+ &mall_tc_entry->mirror);
+ break;
+ default:
+ WARN_ON(1);
+ }
+
+ kfree(mall_tc_entry);
+}
+
+static int dsa_slave_setup_tc(struct net_device *dev, u32 handle,
+ __be16 protocol, struct tc_to_netdev *tc)
+{
+ bool ingress = TC_H_MAJ(handle) == TC_H_MAJ(TC_H_INGRESS);
+ int ret = -EOPNOTSUPP;
+
+ switch (tc->type) {
+ case TC_SETUP_MATCHALL:
+ switch (tc->cls_mall->command) {
+ case TC_CLSMATCHALL_REPLACE:
+ return dsa_slave_add_cls_matchall(dev, protocol,
+ tc->cls_mall,
+ ingress);
+ case TC_CLSMATCHALL_DESTROY:
+ dsa_slave_del_cls_matchall(dev, tc->cls_mall);
+ return 0;
+ }
+ default:
+ break;
+ }
+
+ return ret;
+}
+
void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops)
{
ops->get_sset_count = dsa_cpu_port_get_sset_count;
@@ -989,9 +1163,31 @@ void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops)
ops->get_strings = dsa_cpu_port_get_strings;
}
+static int dsa_slave_get_rxnfc(struct net_device *dev,
+ struct ethtool_rxnfc *nfc, u32 *rule_locs)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->dp->ds;
+
+ if (!ds->ops->get_rxnfc)
+ return -EOPNOTSUPP;
+
+ return ds->ops->get_rxnfc(ds, p->dp->index, nfc, rule_locs);
+}
+
+static int dsa_slave_set_rxnfc(struct net_device *dev,
+ struct ethtool_rxnfc *nfc)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->dp->ds;
+
+ if (!ds->ops->set_rxnfc)
+ return -EOPNOTSUPP;
+
+ return ds->ops->set_rxnfc(ds, p->dp->index, nfc);
+}
+
static const struct ethtool_ops dsa_slave_ethtool_ops = {
- .get_settings = dsa_slave_get_settings,
- .set_settings = dsa_slave_set_settings,
.get_drvinfo = dsa_slave_get_drvinfo,
.get_regs_len = dsa_slave_get_regs_len,
.get_regs = dsa_slave_get_regs,
@@ -1007,6 +1203,10 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
.get_wol = dsa_slave_get_wol,
.set_eee = dsa_slave_set_eee,
.get_eee = dsa_slave_get_eee,
+ .get_link_ksettings = dsa_slave_get_link_ksettings,
+ .set_link_ksettings = dsa_slave_set_link_ksettings,
+ .get_rxnfc = dsa_slave_get_rxnfc,
+ .set_rxnfc = dsa_slave_set_rxnfc,
};
static const struct net_device_ops dsa_slave_netdev_ops = {
@@ -1029,6 +1229,8 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
.ndo_bridge_getlink = switchdev_port_bridge_getlink,
.ndo_bridge_setlink = switchdev_port_bridge_setlink,
.ndo_bridge_dellink = switchdev_port_bridge_dellink,
+ .ndo_get_phys_port_name = dsa_slave_get_phys_port_name,
+ .ndo_setup_tc = dsa_slave_setup_tc,
};
static const struct switchdev_ops dsa_slave_switchdev_ops = {
@@ -1046,7 +1248,7 @@ static struct device_type dsa_type = {
static void dsa_slave_adjust_link(struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
unsigned int status_changed = 0;
if (p->old_link != p->phy->link) {
@@ -1065,7 +1267,7 @@ static void dsa_slave_adjust_link(struct net_device *dev)
}
if (ds->ops->adjust_link && status_changed)
- ds->ops->adjust_link(ds, p->port, p->phy);
+ ds->ops->adjust_link(ds, p->dp->index, p->phy);
if (status_changed)
phy_print_status(p->phy);
@@ -1079,9 +1281,9 @@ static int dsa_slave_fixed_link_update(struct net_device *dev,
if (dev) {
p = netdev_priv(dev);
- ds = p->parent;
+ ds = p->dp->ds;
if (ds->ops->fixed_link_update)
- ds->ops->fixed_link_update(ds, p->port, status);
+ ds->ops->fixed_link_update(ds, p->dp->index, status);
}
return 0;
@@ -1092,7 +1294,7 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p,
struct net_device *slave_dev,
int addr)
{
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
p->phy = mdiobus_get_phy(ds->slave_mii_bus, addr);
if (!p->phy) {
@@ -1103,22 +1305,20 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p,
/* Use already configured phy mode */
if (p->phy_interface == PHY_INTERFACE_MODE_NA)
p->phy_interface = p->phy->interface;
- phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link,
- p->phy_interface);
-
- return 0;
+ return phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link,
+ p->phy_interface);
}
static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
struct net_device *slave_dev)
{
- struct dsa_switch *ds = p->parent;
+ struct dsa_switch *ds = p->dp->ds;
struct device_node *phy_dn, *port_dn;
bool phy_is_fixed = false;
u32 phy_flags = 0;
int mode, ret;
- port_dn = ds->ports[p->port].dn;
+ port_dn = p->dp->dn;
mode = of_get_phy_mode(port_dn);
if (mode < 0)
mode = PHY_INTERFACE_MODE_NA;
@@ -1139,7 +1339,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
}
if (ds->ops->get_phy_flags)
- phy_flags = ds->ops->get_phy_flags(ds, p->port);
+ phy_flags = ds->ops->get_phy_flags(ds, p->dp->index);
if (phy_dn) {
int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn);
@@ -1174,9 +1374,10 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
* MDIO bus instead
*/
if (!p->phy) {
- ret = dsa_slave_phy_connect(p, slave_dev, p->port);
+ ret = dsa_slave_phy_connect(p, slave_dev, p->dp->index);
if (ret) {
- netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->port, ret);
+ netdev_err(slave_dev, "failed to connect to port %d: %d\n",
+ p->dp->index, ret);
if (phy_is_fixed)
of_phy_deregister_fixed_link(port_dn);
return ret;
@@ -1201,6 +1402,8 @@ int dsa_slave_suspend(struct net_device *slave_dev)
{
struct dsa_slave_priv *p = netdev_priv(slave_dev);
+ netif_device_detach(slave_dev);
+
if (p->phy) {
phy_stop(p->phy);
p->old_pause = -1;
@@ -1244,12 +1447,15 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
if (slave_dev == NULL)
return -ENOMEM;
- slave_dev->features = master->vlan_features;
+ slave_dev->features = master->vlan_features | NETIF_F_HW_TC;
+ slave_dev->hw_features |= NETIF_F_HW_TC;
slave_dev->ethtool_ops = &dsa_slave_ethtool_ops;
eth_hw_addr_inherit(slave_dev, master);
slave_dev->priv_flags |= IFF_NO_QUEUE;
slave_dev->netdev_ops = &dsa_slave_netdev_ops;
slave_dev->switchdev_ops = &dsa_slave_switchdev_ops;
+ slave_dev->min_mtu = 0;
+ slave_dev->max_mtu = ETH_MAX_MTU;
SET_NETDEV_DEVTYPE(slave_dev, &dsa_type);
netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one,
@@ -1260,8 +1466,8 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
slave_dev->vlan_features = master->vlan_features;
p = netdev_priv(slave_dev);
- p->parent = ds;
- p->port = port;
+ p->dp = &ds->ports[port];
+ INIT_LIST_HEAD(&p->mall_tc_list);
p->xmit = dst->tag_ops->xmit;
p->old_pause = -1;
@@ -1294,10 +1500,9 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
void dsa_slave_destroy(struct net_device *slave_dev)
{
struct dsa_slave_priv *p = netdev_priv(slave_dev);
- struct dsa_switch *ds = p->parent;
struct device_node *port_dn;
- port_dn = ds->ports[p->port].dn;
+ port_dn = p->dp->dn;
netif_carrier_off(slave_dev);
if (p->phy) {
@@ -1315,46 +1520,52 @@ static bool dsa_slave_dev_check(struct net_device *dev)
return dev->netdev_ops == &dsa_slave_netdev_ops;
}
-static int dsa_slave_port_upper_event(struct net_device *dev,
- unsigned long event, void *ptr)
+static int dsa_slave_changeupper(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
{
- struct netdev_notifier_changeupper_info *info = ptr;
- struct net_device *upper = info->upper_dev;
- int err = 0;
+ int err = NOTIFY_DONE;
- switch (event) {
- case NETDEV_CHANGEUPPER:
- if (netif_is_bridge_master(upper)) {
- if (info->linking)
- err = dsa_slave_bridge_port_join(dev, upper);
- else
- dsa_slave_bridge_port_leave(dev);
+ if (netif_is_bridge_master(info->upper_dev)) {
+ if (info->linking) {
+ err = dsa_slave_bridge_port_join(dev, info->upper_dev);
+ err = notifier_from_errno(err);
+ } else {
+ dsa_slave_bridge_port_leave(dev, info->upper_dev);
+ err = NOTIFY_OK;
}
-
- break;
}
- return notifier_from_errno(err);
+ return err;
}
-static int dsa_slave_port_event(struct net_device *dev, unsigned long event,
- void *ptr)
+static int dsa_slave_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
{
- switch (event) {
- case NETDEV_CHANGEUPPER:
- return dsa_slave_port_upper_event(dev, event, ptr);
- }
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (dev->netdev_ops != &dsa_slave_netdev_ops)
+ return NOTIFY_DONE;
+
+ if (event == NETDEV_CHANGEUPPER)
+ return dsa_slave_changeupper(dev, ptr);
return NOTIFY_DONE;
}
-int dsa_slave_netdevice_event(struct notifier_block *unused,
- unsigned long event, void *ptr)
+static struct notifier_block dsa_slave_nb __read_mostly = {
+ .notifier_call = dsa_slave_netdevice_event,
+};
+
+int dsa_slave_register_notifier(void)
{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ return register_netdevice_notifier(&dsa_slave_nb);
+}
- if (dsa_slave_dev_check(dev))
- return dsa_slave_port_event(dev, event, ptr);
+void dsa_slave_unregister_notifier(void)
+{
+ int err;
- return NOTIFY_DONE;
+ err = unregister_netdevice_notifier(&dsa_slave_nb);
+ if (err)
+ pr_err("DSA: failed to unregister slave notifier (%d)\n", err);
}
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
new file mode 100644
index 000000000000..6456dacf9ae9
--- /dev/null
+++ b/net/dsa/switch.c
@@ -0,0 +1,85 @@
+/*
+ * Handling of a single switch chip, part of a switch fabric
+ *
+ * Copyright (c) 2017 Vivien Didelot <vivien.didelot@savoirfairelinux.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/notifier.h>
+#include <net/dsa.h>
+
+static int dsa_switch_bridge_join(struct dsa_switch *ds,
+ struct dsa_notifier_bridge_info *info)
+{
+ if (ds->index == info->sw_index && ds->ops->port_bridge_join)
+ return ds->ops->port_bridge_join(ds, info->port, info->br);
+
+ if (ds->index != info->sw_index)
+ dev_dbg(ds->dev, "crosschip DSA port %d.%d bridged to %s\n",
+ info->sw_index, info->port, netdev_name(info->br));
+
+ return 0;
+}
+
+static int dsa_switch_bridge_leave(struct dsa_switch *ds,
+ struct dsa_notifier_bridge_info *info)
+{
+ if (ds->index == info->sw_index && ds->ops->port_bridge_leave)
+ ds->ops->port_bridge_leave(ds, info->port, info->br);
+
+ if (ds->index != info->sw_index)
+ dev_dbg(ds->dev, "crosschip DSA port %d.%d unbridged from %s\n",
+ info->sw_index, info->port, netdev_name(info->br));
+
+ return 0;
+}
+
+static int dsa_switch_event(struct notifier_block *nb,
+ unsigned long event, void *info)
+{
+ struct dsa_switch *ds = container_of(nb, struct dsa_switch, nb);
+ int err;
+
+ switch (event) {
+ case DSA_NOTIFIER_BRIDGE_JOIN:
+ err = dsa_switch_bridge_join(ds, info);
+ break;
+ case DSA_NOTIFIER_BRIDGE_LEAVE:
+ err = dsa_switch_bridge_leave(ds, info);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ /* Non-switchdev operations cannot be rolled back. If a DSA driver
+ * returns an error during the chained call, switch chips may be in an
+ * inconsistent state.
+ */
+ if (err)
+ dev_dbg(ds->dev, "breaking chain for DSA event %lu (%d)\n",
+ event, err);
+
+ return notifier_from_errno(err);
+}
+
+int dsa_switch_register_notifier(struct dsa_switch *ds)
+{
+ ds->nb.notifier_call = dsa_switch_event;
+
+ return raw_notifier_chain_register(&ds->dst->nh, &ds->nb);
+}
+
+void dsa_switch_unregister_notifier(struct dsa_switch *ds)
+{
+ int err;
+
+ err = raw_notifier_chain_unregister(&ds->dst->nh, &ds->nb);
+ if (err)
+ dev_err(ds->dev, "failed to unregister notifier (%d)\n", err);
+}
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index 21bffde6e4bf..5d925b6b2bb1 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -80,9 +80,9 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev
((skb->priority << BRCM_IG_TC_SHIFT) & BRCM_IG_TC_MASK);
brcm_tag[1] = 0;
brcm_tag[2] = 0;
- if (p->port == 8)
+ if (p->dp->index == 8)
brcm_tag[2] = BRCM_IG_DSTMAP2_MASK;
- brcm_tag[3] = (1 << p->port) & BRCM_IG_DSTMAP1_MASK;
+ brcm_tag[3] = (1 << p->dp->index) & BRCM_IG_DSTMAP1_MASK;
return skb;
@@ -102,7 +102,7 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
if (unlikely(dst == NULL))
goto out_drop;
- ds = dst->ds[0];
+ ds = dst->cpu_switch;
skb = skb_unshare(skb, GFP_ATOMIC);
if (skb == NULL)
@@ -121,13 +121,14 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
/* We should never see a reserved reason code without knowing how to
* handle it
*/
- WARN_ON(brcm_tag[2] & BRCM_EG_RC_RSVD);
+ if (unlikely(brcm_tag[2] & BRCM_EG_RC_RSVD))
+ goto out_drop;
/* Locate which port this is coming from */
source_port = brcm_tag[3] & BRCM_EG_PID_MASK;
/* Validate port against switch setup, either the port is totally */
- if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
+ if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
goto out_drop;
/* Remove Broadcom tag and update checksum */
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index bce79ffe342b..72579ceea381 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -33,8 +33,8 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
* Construct tagged FROM_CPU DSA tag from 802.1q tag.
*/
dsa_header = skb->data + 2 * ETH_ALEN;
- dsa_header[0] = 0x60 | p->parent->index;
- dsa_header[1] = p->port << 3;
+ dsa_header[0] = 0x60 | p->dp->ds->index;
+ dsa_header[1] = p->dp->index << 3;
/*
* Move CFI field from byte 2 to byte 1.
@@ -54,8 +54,8 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
* Construct untagged FROM_CPU DSA tag.
*/
dsa_header = skb->data + 2 * ETH_ALEN;
- dsa_header[0] = 0x40 | p->parent->index;
- dsa_header[1] = p->port << 3;
+ dsa_header[0] = 0x40 | p->dp->ds->index;
+ dsa_header[1] = p->dp->index << 3;
dsa_header[2] = 0x00;
dsa_header[3] = 0x00;
}
@@ -114,7 +114,7 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
if (!ds)
goto out_drop;
- if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
+ if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
goto out_drop;
/*
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
index 6c1720e88537..648c051817a1 100644
--- a/net/dsa/tag_edsa.c
+++ b/net/dsa/tag_edsa.c
@@ -42,8 +42,8 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
edsa_header[1] = ETH_P_EDSA & 0xff;
edsa_header[2] = 0x00;
edsa_header[3] = 0x00;
- edsa_header[4] = 0x60 | p->parent->index;
- edsa_header[5] = p->port << 3;
+ edsa_header[4] = 0x60 | p->dp->ds->index;
+ edsa_header[5] = p->dp->index << 3;
/*
* Move CFI field from byte 6 to byte 5.
@@ -67,8 +67,8 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
edsa_header[1] = ETH_P_EDSA & 0xff;
edsa_header[2] = 0x00;
edsa_header[3] = 0x00;
- edsa_header[4] = 0x40 | p->parent->index;
- edsa_header[5] = p->port << 3;
+ edsa_header[4] = 0x40 | p->dp->ds->index;
+ edsa_header[5] = p->dp->index << 3;
edsa_header[6] = 0x00;
edsa_header[7] = 0x00;
}
@@ -127,7 +127,7 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
if (!ds)
goto out_drop;
- if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
+ if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
goto out_drop;
/*
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index 0c90cacee7aa..30240f343aea 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -54,7 +54,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
/* Set the version field, and set destination port information */
hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S |
QCA_HDR_XMIT_FROM_CPU |
- BIT(p->port);
+ BIT(p->dp->index);
*phdr = htons(hdr);
@@ -104,7 +104,7 @@ static int qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
/* This protocol doesn't support cascading multiple switches so it's
* safe to assume the switch is first in the tree
*/
- ds = dst->ds[0];
+ ds = dst->cpu_switch;
if (!ds)
goto out_drop;
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index 5e3903eb1afa..26f977176978 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -50,7 +50,7 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev)
trailer = skb_put(nskb, 4);
trailer[0] = 0x80;
- trailer[1] = 1 << p->port;
+ trailer[1] = 1 << p->dp->index;
trailer[2] = 0x10;
trailer[3] = 0x00;
@@ -67,7 +67,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
if (unlikely(dst == NULL))
goto out_drop;
- ds = dst->ds[0];
+ ds = dst->cpu_switch;
skb = skb_unshare(skb, GFP_ATOMIC);
if (skb == NULL)
@@ -82,7 +82,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
goto out_drop;
source_port = trailer[1] & 7;
- if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev)
+ if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
goto out_drop;
pskb_trim_rcsum(skb, skb->len - 4);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 02acfff36028..1446810047f5 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -62,6 +62,7 @@
#include <net/dsa.h>
#include <net/flow_dissector.h>
#include <linux/uaccess.h>
+#include <net/pkt_sched.h>
__setup("ether=", netdev_boot_setup);
@@ -322,8 +323,7 @@ EXPORT_SYMBOL(eth_mac_addr);
*/
int eth_change_mtu(struct net_device *dev, int new_mtu)
{
- if (new_mtu < 68 || new_mtu > ETH_DATA_LEN)
- return -EINVAL;
+ netdev_warn(dev, "%s is deprecated\n", __func__);
dev->mtu = new_mtu;
return 0;
}
@@ -356,9 +356,12 @@ void ether_setup(struct net_device *dev)
dev->header_ops = &eth_header_ops;
dev->type = ARPHRD_ETHER;
dev->hard_header_len = ETH_HLEN;
+ dev->min_header_len = ETH_HLEN;
dev->mtu = ETH_DATA_LEN;
+ dev->min_mtu = ETH_MIN_MTU;
+ dev->max_mtu = ETH_DATA_LEN;
dev->addr_len = ETH_ALEN;
- dev->tx_queue_len = 1000; /* Ethernet wants good queues */
+ dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
dev->flags = IFF_BROADCAST|IFF_MULTICAST;
dev->priv_flags |= IFF_TX_SKB_SHARING;
@@ -390,6 +393,34 @@ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
}
EXPORT_SYMBOL(alloc_etherdev_mqs);
+static void devm_free_netdev(struct device *dev, void *res)
+{
+ free_netdev(*(struct net_device **)res);
+}
+
+struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
+ unsigned int txqs, unsigned int rxqs)
+{
+ struct net_device **dr;
+ struct net_device *netdev;
+
+ dr = devres_alloc(devm_free_netdev, sizeof(*dr), GFP_KERNEL);
+ if (!dr)
+ return NULL;
+
+ netdev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs);
+ if (!netdev) {
+ devres_free(dr);
+ return NULL;
+ }
+
+ *dr = netdev;
+ devres_add(dev, dr);
+
+ return netdev;
+}
+EXPORT_SYMBOL(devm_alloc_etherdev_mqs);
+
ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
{
return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr);
@@ -444,7 +475,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head,
out_unlock:
rcu_read_unlock();
out:
- NAPI_GRO_CB(skb)->flush |= flush;
+ skb_gro_flush_final(skb, pp, flush);
return pp;
}
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index 16737cd8dae8..c73160fb11e7 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -395,9 +395,10 @@ static struct device_type hsr_type = {
void hsr_dev_setup(struct net_device *dev)
{
- random_ether_addr(dev->dev_addr);
+ eth_hw_addr_random(dev);
ether_setup(dev);
+ dev->min_mtu = 0;
dev->header_ops = &hsr_header_ops;
dev->netdev_ops = &hsr_device_ops;
SET_NETDEV_DEVTYPE(dev, &hsr_type);
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index d4d1617f43a8..1ab30e7d3f99 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -131,13 +131,7 @@ static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = {
[HSR_A_IF2_SEQ] = { .type = NLA_U16 },
};
-static struct genl_family hsr_genl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = "HSR",
- .version = 1,
- .maxattr = HSR_A_MAX,
-};
+static struct genl_family hsr_genl_family;
static const struct genl_multicast_group hsr_mcgrps[] = {
{ .name = "hsr-network", },
@@ -467,6 +461,18 @@ static const struct genl_ops hsr_ops[] = {
},
};
+static struct genl_family hsr_genl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = "HSR",
+ .version = 1,
+ .maxattr = HSR_A_MAX,
+ .module = THIS_MODULE,
+ .ops = hsr_ops,
+ .n_ops = ARRAY_SIZE(hsr_ops),
+ .mcgrps = hsr_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(hsr_mcgrps),
+};
+
int __init hsr_netlink_init(void)
{
int rc;
@@ -475,8 +481,7 @@ int __init hsr_netlink_init(void)
if (rc)
goto fail_rtnl_link_register;
- rc = genl_register_family_with_ops_groups(&hsr_genl_family, hsr_ops,
- hsr_mcgrps);
+ rc = genl_register_family(&hsr_genl_family);
if (rc)
goto fail_genl_register_family;
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index f5b60388d02f..56080da4aa77 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -12,6 +12,7 @@
#include "hsr_slave.h"
#include <linux/etherdevice.h>
#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
#include "hsr_main.h"
#include "hsr_device.h"
#include "hsr_forward.h"
@@ -81,7 +82,7 @@ static int hsr_check_dev_ok(struct net_device *dev)
return -EINVAL;
}
- if (dev->priv_flags & IFF_802_1Q_VLAN) {
+ if (is_vlan_dev(dev)) {
netdev_info(dev, "HSR on top of VLAN is not yet supported in this driver.\n");
return -EINVAL;
}
diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h
index 5ac778962e4e..ac7c96b73ad5 100644
--- a/net/ieee802154/6lowpan/6lowpan_i.h
+++ b/net/ieee802154/6lowpan/6lowpan_i.h
@@ -7,7 +7,7 @@
#include <net/inet_frag.h>
#include <net/6lowpan.h>
-typedef unsigned __bitwise__ lowpan_rx_result;
+typedef unsigned __bitwise lowpan_rx_result;
#define RX_CONTINUE ((__force lowpan_rx_result) 0u)
#define RX_DROP_UNUSABLE ((__force lowpan_rx_result) 1u)
#define RX_DROP ((__force lowpan_rx_result) 2u)
diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile
index 4adfd4d5471b..9b92ade687a3 100644
--- a/net/ieee802154/Makefile
+++ b/net/ieee802154/Makefile
@@ -7,5 +7,3 @@ ieee802154-y := netlink.o nl-mac.o nl-phy.o nl_policy.o core.o \
ieee802154_socket-y := socket.o
CFLAGS_trace.o := -I$(src)
-
-ccflags-y += -D__CHECK_ENDIAN__
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
index c8133c07ceee..6bde9e5a5503 100644
--- a/net/ieee802154/netlink.c
+++ b/net/ieee802154/netlink.c
@@ -28,14 +28,6 @@
static unsigned int ieee802154_seq_num;
static DEFINE_SPINLOCK(ieee802154_seq_lock);
-struct genl_family nl802154_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = IEEE802154_NL_NAME,
- .version = 1,
- .maxattr = IEEE802154_ATTR_MAX,
-};
-
/* Requests to userspace */
struct sk_buff *ieee802154_nl_create(int flags, u8 req)
{
@@ -139,11 +131,21 @@ static const struct genl_multicast_group ieee802154_mcgrps[] = {
[IEEE802154_BEACON_MCGRP] = { .name = IEEE802154_MCAST_BEACON_NAME, },
};
+struct genl_family nl802154_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = IEEE802154_NL_NAME,
+ .version = 1,
+ .maxattr = IEEE802154_ATTR_MAX,
+ .module = THIS_MODULE,
+ .ops = ieee8021154_ops,
+ .n_ops = ARRAY_SIZE(ieee8021154_ops),
+ .mcgrps = ieee802154_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(ieee802154_mcgrps),
+};
+
int __init ieee802154_nl_init(void)
{
- return genl_register_family_with_ops_groups(&nl802154_family,
- ieee8021154_ops,
- ieee802154_mcgrps);
+ return genl_register_family(&nl802154_family);
}
void ieee802154_nl_exit(void)
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
index 77d73014bde3..dc2960be51e0 100644
--- a/net/ieee802154/nl-phy.c
+++ b/net/ieee802154/nl-phy.c
@@ -286,9 +286,12 @@ int ieee802154_del_iface(struct sk_buff *skb, struct genl_info *info)
if (name[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0')
return -EINVAL; /* name should be null-terminated */
+ rc = -ENODEV;
dev = dev_get_by_name(genl_info_net(info), name);
if (!dev)
- return -ENODEV;
+ return rc;
+ if (dev->type != ARPHRD_IEEE802154)
+ goto out;
phy = dev->ieee802154_ptr->wpan_phy;
BUG_ON(!phy);
@@ -342,6 +345,7 @@ nla_put_failure:
nlmsg_free(msg);
out_dev:
wpan_phy_put(phy);
+out:
if (dev)
dev_put(dev);
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index d90a4ed5b8a0..fc60cd061f39 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -26,23 +26,8 @@
#include "rdev-ops.h"
#include "core.h"
-static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
- struct genl_info *info);
-
-static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
- struct genl_info *info);
-
/* the netlink family */
-static struct genl_family nl802154_fam = {
- .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */
- .name = NL802154_GENL_NAME, /* have users key off the name instead */
- .hdrsize = 0, /* no private header */
- .version = 1, /* no particular meaning now */
- .maxattr = NL802154_ATTR_MAX,
- .netnsok = true,
- .pre_doit = nl802154_pre_doit,
- .post_doit = nl802154_post_doit,
-};
+static struct genl_family nl802154_fam;
/* multicast groups */
enum nl802154_multicast_groups {
@@ -263,13 +248,14 @@ nl802154_prepare_wpan_dev_dump(struct sk_buff *skb,
if (!cb->args[0]) {
err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize,
- nl802154_fam.attrbuf, nl802154_fam.maxattr,
+ genl_family_attrbuf(&nl802154_fam),
+ nl802154_fam.maxattr,
nl802154_policy);
if (err)
goto out_unlock;
*wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk),
- nl802154_fam.attrbuf);
+ genl_family_attrbuf(&nl802154_fam));
if (IS_ERR(*wpan_dev)) {
err = PTR_ERR(*wpan_dev);
goto out_unlock;
@@ -575,7 +561,7 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb,
struct netlink_callback *cb,
struct nl802154_dump_wpan_phy_state *state)
{
- struct nlattr **tb = nl802154_fam.attrbuf;
+ struct nlattr **tb = genl_family_attrbuf(&nl802154_fam);
int ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize,
tb, nl802154_fam.maxattr, nl802154_policy);
@@ -2476,11 +2462,25 @@ static const struct genl_ops nl802154_ops[] = {
#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
};
+static struct genl_family nl802154_fam __ro_after_init = {
+ .name = NL802154_GENL_NAME, /* have users key off the name instead */
+ .hdrsize = 0, /* no private header */
+ .version = 1, /* no particular meaning now */
+ .maxattr = NL802154_ATTR_MAX,
+ .netnsok = true,
+ .pre_doit = nl802154_pre_doit,
+ .post_doit = nl802154_post_doit,
+ .module = THIS_MODULE,
+ .ops = nl802154_ops,
+ .n_ops = ARRAY_SIZE(nl802154_ops),
+ .mcgrps = nl802154_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(nl802154_mcgrps),
+};
+
/* initialisation/exit functions */
-int nl802154_init(void)
+int __init nl802154_init(void)
{
- return genl_register_family_with_ops_groups(&nl802154_fam, nl802154_ops,
- nl802154_mcgrps);
+ return genl_register_family(&nl802154_fam);
}
void nl802154_exit(void)
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index e0bd013a1e5e..eedba7670b51 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -279,7 +279,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
if (size > mtu) {
- pr_debug("size = %Zu, mtu = %u\n", size, mtu);
+ pr_debug("size = %zu, mtu = %u\n", size, mtu);
err = -EMSGSIZE;
goto out_dev;
}
@@ -645,7 +645,7 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
if (size > mtu) {
- pr_debug("size = %Zu, mtu = %u\n", size, mtu);
+ pr_debug("size = %zu, mtu = %u\n", size, mtu);
err = -EMSGSIZE;
goto out_dev;
}
diff --git a/net/ife/Kconfig b/net/ife/Kconfig
new file mode 100644
index 000000000000..31e48b652c7c
--- /dev/null
+++ b/net/ife/Kconfig
@@ -0,0 +1,16 @@
+#
+# IFE subsystem configuration
+#
+
+menuconfig NET_IFE
+ depends on NET
+ tristate "Inter-FE based on IETF ForCES InterFE LFB"
+ default n
+ help
+ Say Y here to add support of IFE encapsulation protocol
+ For details refer to netdev01 paper:
+ "Distributing Linux Traffic Control Classifier-Action Subsystem"
+ Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+
+ To compile this support as a module, choose M here: the module will
+ be called ife.
diff --git a/net/ife/Makefile b/net/ife/Makefile
new file mode 100644
index 000000000000..2a90d97746cc
--- /dev/null
+++ b/net/ife/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the IFE encapsulation protocol
+#
+
+obj-$(CONFIG_NET_IFE) += ife.o
diff --git a/net/ife/ife.c b/net/ife/ife.c
new file mode 100644
index 000000000000..f360341c72eb
--- /dev/null
+++ b/net/ife/ife.c
@@ -0,0 +1,142 @@
+/*
+ * net/ife/ife.c - Inter-FE protocol based on ForCES WG InterFE LFB
+ * Copyright (c) 2015 Jamal Hadi Salim <jhs@mojatatu.com>
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Refer to: draft-ietf-forces-interfelfb-03 and netdev01 paper:
+ * "Distributing Linux Traffic Control Classifier-Action Subsystem"
+ * Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/etherdevice.h>
+#include <net/ife.h>
+
+struct ifeheadr {
+ __be16 metalen;
+ u8 tlv_data[];
+};
+
+void *ife_encode(struct sk_buff *skb, u16 metalen)
+{
+ /* OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
+ * where ORIGDATA = original ethernet header ...
+ */
+ int hdrm = metalen + IFE_METAHDRLEN;
+ int total_push = hdrm + skb->dev->hard_header_len;
+ struct ifeheadr *ifehdr;
+ struct ethhdr *iethh; /* inner ether header */
+ int skboff = 0;
+ int err;
+
+ err = skb_cow_head(skb, total_push);
+ if (unlikely(err))
+ return NULL;
+
+ iethh = (struct ethhdr *) skb->data;
+
+ __skb_push(skb, total_push);
+ memcpy(skb->data, iethh, skb->dev->hard_header_len);
+ skb_reset_mac_header(skb);
+ skboff += skb->dev->hard_header_len;
+
+ /* total metadata length */
+ ifehdr = (struct ifeheadr *) (skb->data + skboff);
+ metalen += IFE_METAHDRLEN;
+ ifehdr->metalen = htons(metalen);
+
+ return ifehdr->tlv_data;
+}
+EXPORT_SYMBOL_GPL(ife_encode);
+
+void *ife_decode(struct sk_buff *skb, u16 *metalen)
+{
+ struct ifeheadr *ifehdr;
+ int total_pull;
+ u16 ifehdrln;
+
+ ifehdr = (struct ifeheadr *) (skb->data + skb->dev->hard_header_len);
+ ifehdrln = ntohs(ifehdr->metalen);
+ total_pull = skb->dev->hard_header_len + ifehdrln;
+
+ if (unlikely(ifehdrln < 2))
+ return NULL;
+
+ if (unlikely(!pskb_may_pull(skb, total_pull)))
+ return NULL;
+
+ skb_set_mac_header(skb, total_pull);
+ __skb_pull(skb, total_pull);
+ *metalen = ifehdrln - IFE_METAHDRLEN;
+
+ return &ifehdr->tlv_data;
+}
+EXPORT_SYMBOL_GPL(ife_decode);
+
+struct meta_tlvhdr {
+ __be16 type;
+ __be16 len;
+};
+
+/* Caller takes care of presenting data in network order
+ */
+void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, u16 *totlen)
+{
+ struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata;
+
+ *dlen = ntohs(tlv->len) - NLA_HDRLEN;
+ *attrtype = ntohs(tlv->type);
+
+ if (totlen)
+ *totlen = nla_total_size(*dlen);
+
+ return skbdata + sizeof(struct meta_tlvhdr);
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_decode);
+
+void *ife_tlv_meta_next(void *skbdata)
+{
+ struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata;
+ u16 tlvlen = ntohs(tlv->len);
+
+ tlvlen = NLA_ALIGN(tlvlen);
+
+ return skbdata + tlvlen;
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_next);
+
+/* Caller takes care of presenting data in network order
+ */
+int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
+{
+ __be32 *tlv = (__be32 *) (skbdata);
+ u16 totlen = nla_total_size(dlen); /*alignment + hdr */
+ char *dptr = (char *) tlv + NLA_HDRLEN;
+ u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN);
+
+ *tlv = htonl(htlv);
+ memset(dptr, 0, totlen - NLA_HDRLEN);
+ memcpy(dptr, dval, dlen);
+
+ return totlen;
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
+
+MODULE_AUTHOR("Jamal Hadi Salim <jhs@mojatatu.com>");
+MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
+MODULE_DESCRIPTION("Inter-FE LFB action");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index b54b3ca939db..91a2557942fa 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -187,6 +187,7 @@ config NET_IPGRE_DEMUX
config NET_IP_TUNNEL
tristate
select DST_CACHE
+ select GRO_CELLS
default n
config NET_IPGRE
@@ -360,6 +361,19 @@ config INET_ESP
If unsure, say Y.
+config INET_ESP_OFFLOAD
+ tristate "IP: ESP transformation offload"
+ depends on INET_ESP
+ select XFRM_OFFLOAD
+ default n
+ ---help---
+ Support for ESP transformation offload. This makes sense
+ only if this system really does IPsec and want to do it
+ with high throughput. A typical desktop system does not
+ need it, even if it does IPsec.
+
+ If unsure, say N.
+
config INET_IPCOMP
tristate "IP: IPComp transformation"
select INET_XFRM_TUNNEL
@@ -430,6 +444,14 @@ config INET_UDP_DIAG
Support for UDP socket monitoring interface used by the ss tool.
If unsure, say Y.
+config INET_RAW_DIAG
+ tristate "RAW: socket monitoring interface"
+ depends on INET_DIAG && (IPV6 || IPV6=n)
+ default n
+ ---help---
+ Support for RAW socket monitoring interface used by the ss tool.
+ If unsure, say Y.
+
config INET_DIAG_DESTROY
bool "INET: allow privileged process to administratively close sockets"
depends on INET_DIAG
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index bc6a6c8b9bcd..c6d4238ff94a 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_NET_IPVTI) += ip_vti.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_AH) += ah4.o
obj-$(CONFIG_INET_ESP) += esp4.o
+obj-$(CONFIG_INET_ESP_OFFLOAD) += esp4_offload.o
obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
@@ -40,6 +41,7 @@ obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
+obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 215143246e4b..6b1fc6e4278e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -90,7 +90,7 @@
#include <linux/random.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/inet.h>
#include <linux/igmp.h>
@@ -374,8 +374,18 @@ lookup_protocol:
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
- if (err)
+ if (err) {
sk_common_release(sk);
+ goto out;
+ }
+ }
+
+ if (!kern) {
+ err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+ if (err) {
+ sk_common_release(sk);
+ goto out;
+ }
}
out:
return err;
@@ -469,7 +479,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
snum = ntohs(addr->sin_port);
err = -EACCES;
- if (snum && snum < PROT_SOCK &&
+ if (snum && snum < inet_prot_sock(net) &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
goto out;
@@ -560,19 +570,30 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
* TCP 'magic' in here.
*/
int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
- int addr_len, int flags)
+ int addr_len, int flags, int is_sendmsg)
{
struct sock *sk = sock->sk;
int err;
long timeo;
- if (addr_len < sizeof(uaddr->sa_family))
- return -EINVAL;
+ /*
+ * uaddr can be NULL and addr_len can be 0 if:
+ * sk is a TCP fastopen active socket and
+ * TCP_FASTOPEN_CONNECT sockopt is set and
+ * we already have a valid cookie for this socket.
+ * In this case, user can call write() after connect().
+ * write() will invoke tcp_sendmsg_fastopen() which calls
+ * __inet_stream_connect().
+ */
+ if (uaddr) {
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
- if (uaddr->sa_family == AF_UNSPEC) {
- err = sk->sk_prot->disconnect(sk, flags);
- sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
- goto out;
+ if (uaddr->sa_family == AF_UNSPEC) {
+ err = sk->sk_prot->disconnect(sk, flags);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ goto out;
+ }
}
switch (sock->state) {
@@ -583,7 +604,10 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
err = -EISCONN;
goto out;
case SS_CONNECTING:
- err = -EALREADY;
+ if (inet_sk(sk)->defer_connect)
+ err = is_sendmsg ? -EINPROGRESS : -EISCONN;
+ else
+ err = -EALREADY;
/* Fall out of switch with err, set for this state */
break;
case SS_UNCONNECTED:
@@ -597,6 +621,9 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
sock->state = SS_CONNECTING;
+ if (!err && inet_sk(sk)->defer_connect)
+ goto out;
+
/* Just entered SS_CONNECTING state; the only
* difference is that return value in non-blocking
* case is EINPROGRESS, rather than EALREADY.
@@ -652,7 +679,7 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int err;
lock_sock(sock->sk);
- err = __inet_stream_connect(sock, uaddr, addr_len, flags);
+ err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0);
release_sock(sock->sk);
return err;
}
@@ -662,11 +689,12 @@ EXPORT_SYMBOL(inet_stream_connect);
* Accept a pending connection. The TCP layer now gives BSD semantics.
*/
-int inet_accept(struct socket *sock, struct socket *newsock, int flags)
+int inet_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern)
{
struct sock *sk1 = sock->sk;
int err = -EINVAL;
- struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
+ struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern);
if (!sk2)
goto do_err;
@@ -1396,7 +1424,7 @@ out_unlock:
rcu_read_unlock();
out:
- NAPI_GRO_CB(skb)->flush |= flush;
+ skb_gro_flush_final(skb, pp, flush);
return pp;
}
@@ -1460,8 +1488,10 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
int proto = iph->protocol;
int err = -ENOSYS;
- if (skb->encapsulation)
+ if (skb->encapsulation) {
+ skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP));
skb_set_inner_network_header(skb, nhoff);
+ }
csum_replace2(&iph->check, iph->tot_len, newlen);
iph->tot_len = newlen;
@@ -1690,6 +1720,9 @@ static __net_init int inet_init_net(struct net *net)
net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
net->ipv4.sysctl_ip_dynaddr = 0;
net->ipv4.sysctl_ip_early_demux = 1;
+#ifdef CONFIG_SYSCTL
+ net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
+#endif
return 0;
}
@@ -1821,8 +1854,6 @@ static int __init inet_init(void)
ip_init();
- tcp_v4_init();
-
/* Setup TCP slab cache for open requests. */
tcp_init();
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index f2a71025a770..22377c8ff14b 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -270,6 +270,9 @@ static void ah_input_done(struct crypto_async_request *base, int err)
int ihl = ip_hdrlen(skb);
int ah_hlen = (ah->hdrlen + 2) << 2;
+ if (err)
+ goto out;
+
work_iph = AH_SKB_CB(skb)->tmp;
auth_data = ah_tmp_auth(work_iph, ihl);
icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 89a8cac4726a..51b27ae09fbd 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1263,7 +1263,7 @@ void __init arp_init(void)
/*
* ax25 -> ASCII conversion
*/
-static char *ax2asc2(ax25_address *a, char *buf)
+static void ax2asc2(ax25_address *a, char *buf)
{
char c, *s;
int n;
@@ -1285,10 +1285,10 @@ static char *ax2asc2(ax25_address *a, char *buf)
*s++ = n + '0';
*s++ = '\0';
- if (*buf == '\0' || *buf == '-')
- return "*";
-
- return buf;
+ if (*buf == '\0' || *buf == '-') {
+ buf[0] = '*';
+ buf[1] = '\0';
+ }
}
#endif /* CONFIG_AX25 */
@@ -1322,7 +1322,7 @@ static void arp_format_neigh_entry(struct seq_file *seq,
}
#endif
sprintf(tbuf, "%pI4", n->primary_key);
- seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
+ seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s * %s\n",
tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
read_unlock(&n->lock);
}
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 72d6f056d863..ae206163c273 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1587,6 +1587,10 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
goto validate_return_locked;
}
+ if (opt_iter + 1 == opt_len) {
+ err_offset = opt_iter;
+ goto validate_return_locked;
+ }
tag_len = tag[1];
if (tag_len > (opt_len - opt_iter)) {
err_offset = opt_iter + 1;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 062a67ca9a21..cebedd545e5e 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -26,12 +26,13 @@
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
+#include <linux/sched/signal.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
@@ -65,8 +66,6 @@
#include <net/net_namespace.h>
#include <net/addrconf.h>
-#include "fib_lookup.h"
-
static struct ipv4_devconf ipv4_devconf = {
.data = {
[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 20fb25e3027b..b1e24446e297 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -18,6 +18,8 @@
#include <net/protocol.h>
#include <net/udp.h>
+#include <linux/highmem.h>
+
struct esp_skb_cb {
struct xfrm_skb_cb xfrm;
void *tmp;
@@ -92,11 +94,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
__alignof__(struct scatterlist));
}
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
+{
+ struct esp_output_extra *extra = esp_tmp_extra(tmp);
+ struct crypto_aead *aead = x->data;
+ int extralen = 0;
+ u8 *iv;
+ struct aead_request *req;
+ struct scatterlist *sg;
+
+ if (x->props.flags & XFRM_STATE_ESN)
+ extralen += sizeof(*extra);
+
+ extra = esp_tmp_extra(tmp);
+ iv = esp_tmp_iv(aead, tmp, extralen);
+ req = esp_tmp_req(aead, iv);
+
+ /* Unref skb_frag_pages in the src scatterlist if necessary.
+ * Skip the first sg which comes from skb->data.
+ */
+ if (req->src != req->dst)
+ for (sg = sg_next(req->src); sg; sg = sg_next(sg))
+ put_page(sg_page(sg));
+}
+
static void esp_output_done(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
+ void *tmp;
+ struct dst_entry *dst = skb_dst(skb);
+ struct xfrm_state *x = dst->xfrm;
- kfree(ESP_SKB_CB(skb)->tmp);
+ tmp = ESP_SKB_CB(skb)->tmp;
+ esp_ssg_unref(x, tmp);
+ kfree(tmp);
xfrm_output_resume(skb, err);
}
@@ -120,6 +151,29 @@ static void esp_output_restore_header(struct sk_buff *skb)
sizeof(__be32));
}
+static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb,
+ struct ip_esp_hdr *esph,
+ struct esp_output_extra *extra)
+{
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+
+ /* For ESN we move the header forward by 4 bytes to
+ * accomodate the high bits. We will move it back after
+ * encryption.
+ */
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ extra->esphoff = (unsigned char *)esph -
+ skb_transport_header(skb);
+ esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
+ extra->seqhi = esph->spi;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ }
+
+ esph->spi = x->id.spi;
+
+ return esph;
+}
+
static void esp_output_done_esn(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
@@ -128,18 +182,36 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
esp_output_done(base, err);
}
+static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
+{
+ /* Fill padding... */
+ if (tfclen) {
+ memset(tail, 0, tfclen);
+ tail += tfclen;
+ }
+ do {
+ int i;
+ for (i = 0; i < plen - 2; i++)
+ tail[i] = i + 1;
+ } while (0);
+ tail[plen - 2] = plen - 2;
+ tail[plen - 1] = proto;
+}
+
static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
{
- int err;
struct esp_output_extra *extra;
+ int err = -ENOMEM;
struct ip_esp_hdr *esph;
struct crypto_aead *aead;
struct aead_request *req;
- struct scatterlist *sg;
+ struct scatterlist *sg, *dsg;
struct sk_buff *trailer;
+ struct page *page;
void *tmp;
u8 *iv;
u8 *tail;
+ u8 *vaddr;
int blksize;
int clen;
int alen;
@@ -149,7 +221,9 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
int nfrags;
int assoclen;
int extralen;
+ int tailen;
__be64 seqno;
+ __u8 proto = *skb_mac_header(skb);
/* skb is pure payload to encrypt */
@@ -169,12 +243,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
blksize = ALIGN(crypto_aead_blocksize(aead), 4);
clen = ALIGN(skb->len + 2 + tfclen, blksize);
plen = clen - skb->len - tfclen;
-
- err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
- if (err < 0)
- goto error;
- nfrags = err;
-
+ tailen = tfclen + plen + alen;
assoclen = sizeof(*esph);
extralen = 0;
@@ -183,35 +252,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
assoclen += sizeof(__be32);
}
- tmp = esp_alloc_tmp(aead, nfrags, extralen);
- if (!tmp) {
- err = -ENOMEM;
- goto error;
- }
-
- extra = esp_tmp_extra(tmp);
- iv = esp_tmp_iv(aead, tmp, extralen);
- req = esp_tmp_req(aead, iv);
- sg = esp_req_sg(aead, req);
-
- /* Fill padding... */
- tail = skb_tail_pointer(trailer);
- if (tfclen) {
- memset(tail, 0, tfclen);
- tail += tfclen;
- }
- do {
- int i;
- for (i = 0; i < plen - 2; i++)
- tail[i] = i + 1;
- } while (0);
- tail[plen - 2] = plen - 2;
- tail[plen - 1] = *skb_mac_header(skb);
- pskb_put(skb, trailer, clen - skb->len + alen);
-
- skb_push(skb, -skb_network_offset(skb));
- esph = ip_esp_hdr(skb);
*skb_mac_header(skb) = IPPROTO_ESP;
+ esph = ip_esp_hdr(skb);
/* this is non-NULL only with UDP Encapsulation */
if (x->encap) {
@@ -230,7 +272,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
uh = (struct udphdr *)esph;
uh->source = sport;
uh->dest = dport;
- uh->len = htons(skb->len - skb_transport_offset(skb));
+ uh->len = htons(skb->len + tailen
+ - skb_transport_offset(skb));
uh->check = 0;
switch (encap_type) {
@@ -248,31 +291,148 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
*skb_mac_header(skb) = IPPROTO_UDP;
}
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+ if (!skb_cloned(skb)) {
+ if (tailen <= skb_availroom(skb)) {
+ nfrags = 1;
+ trailer = skb;
+ tail = skb_tail_pointer(trailer);
- aead_request_set_callback(req, 0, esp_output_done, skb);
+ goto skip_cow;
+ } else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
+ && !skb_has_frag_list(skb)) {
+ int allocsize;
+ struct sock *sk = skb->sk;
+ struct page_frag *pfrag = &x->xfrag;
- /* For ESN we move the header forward by 4 bytes to
- * accomodate the high bits. We will move it back after
- * encryption.
- */
- if ((x->props.flags & XFRM_STATE_ESN)) {
- extra->esphoff = (unsigned char *)esph -
- skb_transport_header(skb);
- esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
- extra->seqhi = esph->spi;
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
- aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+ allocsize = ALIGN(tailen, L1_CACHE_BYTES);
+
+ spin_lock_bh(&x->lock);
+
+ if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+ spin_unlock_bh(&x->lock);
+ goto cow;
+ }
+
+ page = pfrag->page;
+ get_page(page);
+
+ vaddr = kmap_atomic(page);
+
+ tail = vaddr + pfrag->offset;
+
+ esp_output_fill_trailer(tail, tfclen, plen, proto);
+
+ kunmap_atomic(vaddr);
+
+ nfrags = skb_shinfo(skb)->nr_frags;
+
+ __skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
+ tailen);
+ skb_shinfo(skb)->nr_frags = ++nfrags;
+
+ pfrag->offset = pfrag->offset + allocsize;
+ nfrags++;
+
+ skb->len += tailen;
+ skb->data_len += tailen;
+ skb->truesize += tailen;
+ if (sk)
+ atomic_add(tailen, &sk->sk_wmem_alloc);
+
+ skb_push(skb, -skb_network_offset(skb));
+
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+ esph->spi = x->id.spi;
+
+ tmp = esp_alloc_tmp(aead, nfrags + 2, extralen);
+ if (!tmp) {
+ spin_unlock_bh(&x->lock);
+ err = -ENOMEM;
+ goto error;
+ }
+
+ extra = esp_tmp_extra(tmp);
+ iv = esp_tmp_iv(aead, tmp, extralen);
+ req = esp_tmp_req(aead, iv);
+ sg = esp_req_sg(aead, req);
+ dsg = &sg[nfrags];
+
+ esph = esp_output_set_extra(skb, esph, extra);
+
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg,
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + clen + alen);
+
+ allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
+
+ if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+ spin_unlock_bh(&x->lock);
+ err = -ENOMEM;
+ goto error;
+ }
+
+ skb_shinfo(skb)->nr_frags = 1;
+
+ page = pfrag->page;
+ get_page(page);
+ /* replace page frags in skb with new page */
+ __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
+ pfrag->offset = pfrag->offset + allocsize;
+
+ sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
+ skb_to_sgvec(skb, dsg,
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + clen + alen);
+
+ spin_unlock_bh(&x->lock);
+
+ goto skip_cow2;
+ }
}
+cow:
+ err = skb_cow_data(skb, tailen, &trailer);
+ if (err < 0)
+ goto error;
+ nfrags = err;
+ tail = skb_tail_pointer(trailer);
+ esph = ip_esp_hdr(skb);
+
+skip_cow:
+ esp_output_fill_trailer(tail, tfclen, plen, proto);
+
+ pskb_put(skb, trailer, clen - skb->len + alen);
+ skb_push(skb, -skb_network_offset(skb));
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
esph->spi = x->id.spi;
+ tmp = esp_alloc_tmp(aead, nfrags, extralen);
+ if (!tmp) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ extra = esp_tmp_extra(tmp);
+ iv = esp_tmp_iv(aead, tmp, extralen);
+ req = esp_tmp_req(aead, iv);
+ sg = esp_req_sg(aead, req);
+ dsg = sg;
+
+ esph = esp_output_set_extra(skb, esph, extra);
+
sg_init_table(sg, nfrags);
skb_to_sgvec(skb, sg,
(unsigned char *)esph - skb->data,
assoclen + ivlen + clen + alen);
- aead_request_set_crypt(req, sg, sg, ivlen + clen, iv);
+skip_cow2:
+ if ((x->props.flags & XFRM_STATE_ESN))
+ aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+ else
+ aead_request_set_callback(req, 0, esp_output_done, skb);
+
+ aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv);
aead_request_set_ad(req, assoclen);
seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
@@ -298,6 +458,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
esp_output_restore_header(skb);
}
+ if (sg != dsg)
+ esp_ssg_unref(x, tmp);
kfree(tmp);
error:
@@ -401,6 +563,23 @@ static void esp_input_restore_header(struct sk_buff *skb)
__skb_pull(skb, 4);
}
+static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
+{
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data;
+
+ /* For ESN we move the header forward by 4 bytes to
+ * accomodate the high bits. We will move it back after
+ * decryption.
+ */
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ esph = (void *)skb_push(skb, 4);
+ *seqhi = esph->spi;
+ esph->spi = esph->seq_no;
+ esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+ }
+}
+
static void esp_input_done_esn(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
@@ -437,12 +616,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
if (elen <= 0)
goto out;
- err = skb_cow_data(skb, 0, &trailer);
- if (err < 0)
- goto out;
-
- nfrags = err;
-
assoclen = sizeof(*esph);
seqhilen = 0;
@@ -451,6 +624,26 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
assoclen += seqhilen;
}
+ if (!skb_cloned(skb)) {
+ if (!skb_is_nonlinear(skb)) {
+ nfrags = 1;
+
+ goto skip_cow;
+ } else if (!skb_has_frag_list(skb)) {
+ nfrags = skb_shinfo(skb)->nr_frags;
+ nfrags++;
+
+ goto skip_cow;
+ }
+ }
+
+ err = skb_cow_data(skb, 0, &trailer);
+ if (err < 0)
+ goto out;
+
+ nfrags = err;
+
+skip_cow:
err = -ENOMEM;
tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
if (!tmp)
@@ -462,26 +655,17 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
req = esp_tmp_req(aead, iv);
sg = esp_req_sg(aead, req);
- skb->ip_summed = CHECKSUM_NONE;
+ esp_input_set_header(skb, seqhi);
- esph = (struct ip_esp_hdr *)skb->data;
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg, 0, skb->len);
- aead_request_set_callback(req, 0, esp_input_done, skb);
+ skb->ip_summed = CHECKSUM_NONE;
- /* For ESN we move the header forward by 4 bytes to
- * accomodate the high bits. We will move it back after
- * decryption.
- */
- if ((x->props.flags & XFRM_STATE_ESN)) {
- esph = (void *)skb_push(skb, 4);
- *seqhi = esph->spi;
- esph->spi = esph->seq_no;
- esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+ if ((x->props.flags & XFRM_STATE_ESN))
aead_request_set_callback(req, 0, esp_input_done_esn, skb);
- }
-
- sg_init_table(sg, nfrags);
- skb_to_sgvec(skb, sg, 0, skb->len);
+ else
+ aead_request_set_callback(req, 0, esp_input_done, skb);
aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
aead_request_set_ad(req, assoclen);
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
new file mode 100644
index 000000000000..1de442632406
--- /dev/null
+++ b/net/ipv4/esp4_offload.c
@@ -0,0 +1,106 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * Copyright (C) 2016 secunet Security Networks AG
+ * Author: Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * ESP GRO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/udp.h>
+
+static struct sk_buff **esp4_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ int offset = skb_gro_offset(skb);
+ struct xfrm_offload *xo;
+ struct xfrm_state *x;
+ __be32 seq;
+ __be32 spi;
+ int err;
+
+ skb_pull(skb, offset);
+
+ if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0)
+ goto out;
+
+ err = secpath_set(skb);
+ if (err)
+ goto out;
+
+ if (skb->sp->len == XFRM_MAX_DEPTH)
+ goto out;
+
+ x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
+ (xfrm_address_t *)&ip_hdr(skb)->daddr,
+ spi, IPPROTO_ESP, AF_INET);
+ if (!x)
+ goto out;
+
+ skb->sp->xvec[skb->sp->len++] = x;
+ skb->sp->olen++;
+
+ xo = xfrm_offload(skb);
+ if (!xo) {
+ xfrm_state_put(x);
+ goto out;
+ }
+ xo->flags |= XFRM_GRO;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+ XFRM_SPI_SKB_CB(skb)->seq = seq;
+
+ /* We don't need to handle errors from xfrm_input, it does all
+ * the error handling and frees the resources on error. */
+ xfrm_input(skb, IPPROTO_ESP, spi, -2);
+
+ return ERR_PTR(-EINPROGRESS);
+out:
+ skb_push(skb, offset);
+ NAPI_GRO_CB(skb)->same_flow = 0;
+ NAPI_GRO_CB(skb)->flush = 1;
+
+ return NULL;
+}
+
+static const struct net_offload esp4_offload = {
+ .callbacks = {
+ .gro_receive = esp4_gro_receive,
+ },
+};
+
+static int __init esp4_offload_init(void)
+{
+ return inet_add_offload(&esp4_offload, IPPROTO_ESP);
+}
+
+static void __exit esp4_offload_exit(void)
+{
+ inet_del_offload(&esp4_offload, IPPROTO_ESP);
+}
+
+module_init(esp4_offload_init);
+module_exit(esp4_offload_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 161fc0f0d752..8f2133ffc2ff 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -14,7 +14,7 @@
*/
#include <linux/module.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/types.h>
@@ -46,6 +46,7 @@
#include <net/rtnetlink.h>
#include <net/xfrm.h>
#include <net/l3mdev.h>
+#include <net/lwtunnel.h>
#include <trace/events/fib.h>
#ifndef CONFIG_IP_MULTIPLE_TABLES
@@ -85,7 +86,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
if (tb)
return tb;
- if (id == RT_TABLE_LOCAL)
+ if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules)
alias = fib_new_table(net, RT_TABLE_MAIN);
tb = fib_trie_table(id, alias);
@@ -318,7 +319,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
int ret, no_addr;
struct fib_result res;
struct flowi4 fl4;
- struct net *net;
+ struct net *net = dev_net(dev);
bool dev_match;
fl4.flowi4_oif = 0;
@@ -331,6 +332,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_tun_key.tun_id = 0;
fl4.flowi4_flags = 0;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
no_addr = idev->ifa_list == NULL;
@@ -338,13 +340,12 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
trace_fib_validate_source(dev, &fl4);
- net = dev_net(dev);
if (fib_lookup(net, &fl4, &res, 0))
goto last_resort;
if (res.type != RTN_UNICAST &&
(res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
goto e_inval;
- if (!rpf && !fib_num_tclassid_users(dev_net(dev)) &&
+ if (!rpf && !fib_num_tclassid_users(net) &&
(dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
goto last_resort;
fib_combine_itag(itag, &res);
@@ -620,6 +621,8 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_FLOW] = { .type = NLA_U32 },
[RTA_ENCAP_TYPE] = { .type = NLA_U16 },
[RTA_ENCAP] = { .type = NLA_NESTED },
+ [RTA_UID] = { .type = NLA_U32 },
+ [RTA_MARK] = { .type = NLA_U32 },
};
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
@@ -676,6 +679,10 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
cfg->fc_mx_len = nla_len(attr);
break;
case RTA_MULTIPATH:
+ err = lwtunnel_valid_encap_type_attr(nla_data(attr),
+ nla_len(attr));
+ if (err < 0)
+ goto errout;
cfg->fc_mp = nla_data(attr);
cfg->fc_mp_len = nla_len(attr);
break;
@@ -690,6 +697,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
break;
case RTA_ENCAP_TYPE:
cfg->fc_encap_type = nla_get_u16(attr);
+ err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
+ if (err < 0)
+ goto errout;
break;
}
}
@@ -1073,7 +1083,8 @@ static void nl_fib_input(struct sk_buff *skb)
net = sock_net(skb->sk);
nlh = nlmsg_hdr(skb);
- if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len ||
+ if (skb->len < nlmsg_total_size(sizeof(*frn)) ||
+ skb->len < nlh->nlmsg_len ||
nlmsg_len(nlh) < sizeof(*frn))
return;
@@ -1218,6 +1229,8 @@ static int __net_init ip_fib_net_init(struct net *net)
int err;
size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
+ net->ipv4.fib_seq = 0;
+
/* Avoid false sharing : Use at least a full cache line */
size = max_t(size_t, size, L1_CACHE_BYTES);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 388d3e21629b..317026a39cfa 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -13,7 +13,7 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -234,6 +234,7 @@ void free_fib_info(struct fib_info *fi)
#endif
call_rcu(&fi->rcu, free_fib_info_rcu);
}
+EXPORT_SYMBOL_GPL(free_fib_info);
void fib_release_info(struct fib_info *fi)
{
@@ -470,7 +471,6 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
int remaining, struct fib_config *cfg)
{
- struct net *net = cfg->fc_nlinfo.nl_net;
int ret;
change_nexthops(fi) {
@@ -502,16 +502,14 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
nla = nla_find(attrs, attrlen, RTA_ENCAP);
if (nla) {
struct lwtunnel_state *lwtstate;
- struct net_device *dev = NULL;
struct nlattr *nla_entype;
nla_entype = nla_find(attrs, attrlen,
RTA_ENCAP_TYPE);
if (!nla_entype)
goto err_inval;
- if (cfg->fc_oif)
- dev = __dev_get_by_index(net, cfg->fc_oif);
- ret = lwtunnel_build_state(dev, nla_get_u16(
+
+ ret = lwtunnel_build_state(nla_get_u16(
nla_entype),
nla, AF_INET, cfg,
&lwtstate);
@@ -596,21 +594,18 @@ static inline void fib_add_weight(struct fib_info *fi,
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
-static int fib_encap_match(struct net *net, u16 encap_type,
+static int fib_encap_match(u16 encap_type,
struct nlattr *encap,
- int oif, const struct fib_nh *nh,
+ const struct fib_nh *nh,
const struct fib_config *cfg)
{
struct lwtunnel_state *lwtstate;
- struct net_device *dev = NULL;
int ret, result = 0;
if (encap_type == LWTUNNEL_ENCAP_NONE)
return 0;
- if (oif)
- dev = __dev_get_by_index(net, oif);
- ret = lwtunnel_build_state(dev, encap_type, encap,
+ ret = lwtunnel_build_state(encap_type, encap,
AF_INET, cfg, &lwtstate);
if (!ret) {
result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
@@ -622,7 +617,6 @@ static int fib_encap_match(struct net *net, u16 encap_type,
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
- struct net *net = cfg->fc_nlinfo.nl_net;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
struct rtnexthop *rtnh;
int remaining;
@@ -633,9 +627,8 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
if (cfg->fc_oif || cfg->fc_gw) {
if (cfg->fc_encap) {
- if (fib_encap_match(net, cfg->fc_encap_type,
- cfg->fc_encap, cfg->fc_oif,
- fi->fib_nh, cfg))
+ if (fib_encap_match(cfg->fc_encap_type,
+ cfg->fc_encap, fi->fib_nh, cfg))
return 1;
}
if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
@@ -1092,13 +1085,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (cfg->fc_encap) {
struct lwtunnel_state *lwtstate;
- struct net_device *dev = NULL;
if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
goto err_inval;
- if (cfg->fc_oif)
- dev = __dev_get_by_index(net, cfg->fc_oif);
- err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+ err = lwtunnel_build_state(cfg->fc_encap_type,
cfg->fc_encap, AF_INET, cfg,
&lwtstate);
if (err)
@@ -1278,8 +1268,9 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
goto nla_put_failure;
#endif
- if (fi->fib_nh->nh_lwtstate)
- lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate);
+ if (fi->fib_nh->nh_lwtstate &&
+ lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate) < 0)
+ goto nla_put_failure;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (fi->fib_nhs > 1) {
@@ -1315,8 +1306,10 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
goto nla_put_failure;
#endif
- if (nh->nh_lwtstate)
- lwtunnel_fill_encap(skb, nh->nh_lwtstate);
+ if (nh->nh_lwtstate &&
+ lwtunnel_fill_encap(skb, nh->nh_lwtstate) < 0)
+ goto nla_put_failure;
+
/* length of rtnetlink header + attributes */
rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
} endfor_nexthops(fi);
@@ -1362,6 +1355,36 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local)
return ret;
}
+static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
+ enum fib_event_type event_type)
+{
+ struct in_device *in_dev = __in_dev_get_rtnl(fib_nh->nh_dev);
+ struct fib_nh_notifier_info info = {
+ .fib_nh = fib_nh,
+ };
+
+ switch (event_type) {
+ case FIB_EVENT_NH_ADD:
+ if (fib_nh->nh_flags & RTNH_F_DEAD)
+ break;
+ if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ fib_nh->nh_flags & RTNH_F_LINKDOWN)
+ break;
+ return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type,
+ &info.info);
+ case FIB_EVENT_NH_DEL:
+ if ((IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ fib_nh->nh_flags & RTNH_F_LINKDOWN) ||
+ (fib_nh->nh_flags & RTNH_F_DEAD))
+ return call_fib_notifiers(dev_net(fib_nh->nh_dev),
+ event_type, &info.info);
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
/* Event force Flags Description
* NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host
* NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host
@@ -1403,6 +1426,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
break;
}
+ call_fib_nh_notifiers(nexthop_nh,
+ FIB_EVENT_NH_DEL);
dead++;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1433,7 +1458,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
}
/* Must be invoked inside of an RCU protected region. */
-void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
+static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
{
struct fib_info *fi = NULL, *last_resort = NULL;
struct hlist_head *fa_head = res->fa_head;
@@ -1557,6 +1582,7 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
continue;
alive++;
nexthop_nh->nh_flags &= ~nh_flags;
+ call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
} endfor_nexthops(fi)
if (alive > 0) {
@@ -1617,8 +1643,13 @@ void fib_select_multipath(struct fib_result *res, int hash)
void fib_select_path(struct net *net, struct fib_result *res,
struct flowi4 *fl4, int mp_hash)
{
+ bool oif_check;
+
+ oif_check = (fl4->flowi4_oif == 0 ||
+ fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF);
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) {
+ if (res->fi->fib_nhs > 1 && oif_check) {
if (mp_hash < 0)
mp_hash = get_hash_from_flowi4(fl4) >> 1;
@@ -1628,7 +1659,7 @@ void fib_select_path(struct net *net, struct fib_result *res,
#endif
if (!res->prefixlen &&
res->table->tb_num_default > 1 &&
- res->type == RTN_UNICAST && !fl4->flowi4_oif)
+ res->type == RTN_UNICAST && oif_check)
fib_select_default(fl4, res);
if (!fl4->saddr)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index e3665bf7a7f3..2f0d8233950f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -50,7 +50,7 @@
#define VERSION "0.409"
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -84,31 +84,119 @@
#include <trace/events/fib.h>
#include "fib_lookup.h"
-static BLOCKING_NOTIFIER_HEAD(fib_chain);
+static unsigned int fib_seq_sum(void)
+{
+ unsigned int fib_seq = 0;
+ struct net *net;
+
+ rtnl_lock();
+ for_each_net(net)
+ fib_seq += net->ipv4.fib_seq;
+ rtnl_unlock();
+
+ return fib_seq;
+}
+
+static ATOMIC_NOTIFIER_HEAD(fib_chain);
+
+static int call_fib_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ info->net = net;
+ return nb->notifier_call(nb, event_type, info);
+}
+
+static void fib_rules_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ struct fib_notifier_info info;
+
+ if (net->ipv4.fib_has_custom_rules)
+ call_fib_notifier(nb, net, event_type, &info);
+#endif
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type);
+
+static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type, u32 dst,
+ int dst_len, struct fib_info *fi,
+ u8 tos, u8 type, u32 tb_id)
+{
+ struct fib_entry_notifier_info info = {
+ .dst = dst,
+ .dst_len = dst_len,
+ .fi = fi,
+ .tos = tos,
+ .type = type,
+ .tb_id = tb_id,
+ };
+ return call_fib_notifier(nb, net, event_type, &info.info);
+}
-int register_fib_notifier(struct notifier_block *nb)
+static bool fib_dump_is_consistent(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb),
+ unsigned int fib_seq)
{
- return blocking_notifier_chain_register(&fib_chain, nb);
+ atomic_notifier_chain_register(&fib_chain, nb);
+ if (fib_seq == fib_seq_sum())
+ return true;
+ atomic_notifier_chain_unregister(&fib_chain, nb);
+ if (cb)
+ cb(nb);
+ return false;
+}
+
+#define FIB_DUMP_MAX_RETRIES 5
+int register_fib_notifier(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb))
+{
+ int retries = 0;
+
+ do {
+ unsigned int fib_seq = fib_seq_sum();
+ struct net *net;
+
+ /* Mutex semantics guarantee that every change done to
+ * FIB tries before we read the change sequence counter
+ * is now visible to us.
+ */
+ rcu_read_lock();
+ for_each_net_rcu(net) {
+ fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD);
+ fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
+ }
+ rcu_read_unlock();
+
+ if (fib_dump_is_consistent(nb, cb, fib_seq))
+ return 0;
+ } while (++retries < FIB_DUMP_MAX_RETRIES);
+
+ return -EBUSY;
}
EXPORT_SYMBOL(register_fib_notifier);
int unregister_fib_notifier(struct notifier_block *nb)
{
- return blocking_notifier_chain_unregister(&fib_chain, nb);
+ return atomic_notifier_chain_unregister(&fib_chain, nb);
}
EXPORT_SYMBOL(unregister_fib_notifier);
int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
struct fib_notifier_info *info)
{
+ net->ipv4.fib_seq++;
info->net = net;
- return blocking_notifier_call_chain(&fib_chain, event_type, info);
+ return atomic_notifier_call_chain(&fib_chain, event_type, info);
}
static int call_fib_entry_notifiers(struct net *net,
enum fib_event_type event_type, u32 dst,
int dst_len, struct fib_info *fi,
- u8 tos, u8 type, u32 tb_id, u32 nlflags)
+ u8 tos, u8 type, u32 tb_id)
{
struct fib_entry_notifier_info info = {
.dst = dst,
@@ -117,7 +205,6 @@ static int call_fib_entry_notifiers(struct net *net,
.tos = tos,
.type = type,
.tb_id = tb_id,
- .nlflags = nlflags,
};
return call_fib_notifiers(net, event_type, &info.info);
}
@@ -1109,6 +1196,7 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
int fib_table_insert(struct net *net, struct fib_table *tb,
struct fib_config *cfg)
{
+ enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
struct key_vector *l, *tp;
@@ -1206,6 +1294,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
+ key, plen, fi,
+ new_fa->fa_tos, cfg->fc_type,
+ tb->tb_id);
+ rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
+ tb->tb_id, &cfg->fc_nlinfo, nlflags);
+
hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
alias_free_mem_rcu(fa);
@@ -1214,13 +1309,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
- call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
- key, plen, fi,
- new_fa->fa_tos, cfg->fc_type,
- tb->tb_id, cfg->fc_nlflags);
- rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
- tb->tb_id, &cfg->fc_nlinfo, nlflags);
-
goto succeeded;
}
/* Error if we find a perfect match which
@@ -1230,10 +1318,12 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
if (fa_match)
goto out;
- if (cfg->fc_nlflags & NLM_F_APPEND)
+ if (cfg->fc_nlflags & NLM_F_APPEND) {
+ event = FIB_EVENT_ENTRY_APPEND;
nlflags |= NLM_F_APPEND;
- else
+ } else {
fa = fa_first;
+ }
}
err = -ENOENT;
if (!(cfg->fc_nlflags & NLM_F_CREATE))
@@ -1262,8 +1352,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
tb->tb_num_default++;
rt_cache_flush(cfg->fc_nlinfo.nl_net);
- call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos,
- cfg->fc_type, tb->tb_id, cfg->fc_nlflags);
+ call_fib_entry_notifiers(net, event, key, plen, fi, tos, cfg->fc_type,
+ tb->tb_id);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
&cfg->fc_nlinfo, nlflags);
succeeded:
@@ -1564,8 +1654,8 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
return -ESRCH;
call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
- fa_to_delete->fa_info, tos, cfg->fc_type,
- tb->tb_id, 0);
+ fa_to_delete->fa_info, tos,
+ fa_to_delete->fa_type, tb->tb_id);
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
@@ -1874,7 +1964,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
struct fib_info *fi = fa->fa_info;
- if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) {
+ if (!fi || !(fi->fib_flags & RTNH_F_DEAD) ||
+ tb->tb_id != fa->tb_id) {
slen = fa->fa_slen;
continue;
}
@@ -1883,7 +1974,7 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
n->key,
KEYLENGTH - fa->fa_slen,
fi, fa->fa_tos, fa->fa_type,
- tb->tb_id, 0);
+ tb->tb_id);
hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
@@ -1903,6 +1994,62 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
return found;
}
+static void fib_leaf_notify(struct net *net, struct key_vector *l,
+ struct fib_table *tb, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ struct fib_alias *fa;
+
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (!fi)
+ continue;
+
+ /* local and main table can share the same trie,
+ * so don't notify twice for the same entry.
+ */
+ if (tb->tb_id != fa->tb_id)
+ continue;
+
+ call_fib_entry_notifier(nb, net, event_type, l->key,
+ KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
+ fa->fa_type, fa->tb_id);
+ }
+}
+
+static void fib_table_notify(struct net *net, struct fib_table *tb,
+ struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *l, *tp = t->kv;
+ t_key key = 0;
+
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+ fib_leaf_notify(net, l, tb, nb, event_type);
+
+ key = l->key + 1;
+ /* stop in case of wrap around */
+ if (key < l->key)
+ break;
+ }
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb_hlist)
+ fib_table_notify(net, tb, nb, event_type);
+ }
+}
+
static void __trie_free_rcu(struct rcu_head *head)
{
struct fib_table *tb = container_of(head, struct fib_table, rcu);
@@ -2241,7 +2388,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,
"Basic info: size of leaf:"
- " %Zd bytes, size of tnode: %Zd bytes.\n",
+ " %zd bytes, size of tnode: %zd bytes.\n",
LEAF_SIZE, TNODE_SIZE(0));
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 030d1531e897..805f6607f8d9 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -622,14 +622,7 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg)
return err;
}
-static struct genl_family fou_nl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = FOU_GENL_NAME,
- .version = FOU_GENL_VERSION,
- .maxattr = FOU_ATTR_MAX,
- .netnsok = true,
-};
+static struct genl_family fou_nl_family;
static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
[FOU_ATTR_PORT] = { .type = NLA_U16, },
@@ -831,6 +824,17 @@ static const struct genl_ops fou_nl_ops[] = {
},
};
+static struct genl_family fou_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = FOU_GENL_NAME,
+ .version = FOU_GENL_VERSION,
+ .maxattr = FOU_ATTR_MAX,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = fou_nl_ops,
+ .n_ops = ARRAY_SIZE(fou_nl_ops),
+};
+
size_t fou_encap_hlen(struct ip_tunnel_encap *e)
{
return sizeof(struct udphdr);
@@ -1086,8 +1090,7 @@ static int __init fou_init(void)
if (ret)
goto exit;
- ret = genl_register_family_with_ops(&fou_nl_family,
- fou_nl_ops);
+ ret = genl_register_family(&fou_nl_family);
if (ret < 0)
goto unregister;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 48734ee6293f..fc310db2708b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -91,7 +91,7 @@
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/init.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <net/inet_common.h>
@@ -209,19 +209,17 @@ static struct sock *icmp_sk(struct net *net)
return *this_cpu_ptr(net->ipv4.icmp_sk);
}
+/* Called with BH disabled */
static inline struct sock *icmp_xmit_lock(struct net *net)
{
struct sock *sk;
- local_bh_disable();
-
sk = icmp_sk(net);
if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
/* This can happen if the output path signals a
* dst_link_failure() for an outgoing ICMP packet.
*/
- local_bh_enable();
return NULL;
}
return sk;
@@ -229,7 +227,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
static inline void icmp_xmit_unlock(struct sock *sk)
{
- spin_unlock_bh(&sk->sk_lock.slock);
+ spin_unlock(&sk->sk_lock.slock);
}
int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
@@ -282,6 +280,33 @@ bool icmp_global_allow(void)
}
EXPORT_SYMBOL(icmp_global_allow);
+static bool icmpv4_mask_allow(struct net *net, int type, int code)
+{
+ if (type > NR_ICMP_TYPES)
+ return true;
+
+ /* Don't limit PMTU discovery. */
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ return true;
+
+ /* Limit if icmp type is enabled in ratemask. */
+ if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+ return true;
+
+ return false;
+}
+
+static bool icmpv4_global_allow(struct net *net, int type, int code)
+{
+ if (icmpv4_mask_allow(net, type, code))
+ return true;
+
+ if (icmp_global_allow())
+ return true;
+
+ return false;
+}
+
/*
* Send an ICMP frame.
*/
@@ -290,34 +315,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
struct flowi4 *fl4, int type, int code)
{
struct dst_entry *dst = &rt->dst;
+ struct inet_peer *peer;
bool rc = true;
+ int vif;
- if (type > NR_ICMP_TYPES)
- goto out;
-
- /* Don't limit PMTU discovery. */
- if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ if (icmpv4_mask_allow(net, type, code))
goto out;
/* No rate limit on loopback */
if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
goto out;
- /* Limit if icmp type is enabled in ratemask. */
- if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
- goto out;
-
- rc = false;
- if (icmp_global_allow()) {
- int vif = l3mdev_master_ifindex(dst->dev);
- struct inet_peer *peer;
-
- peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
- rc = inet_peer_xrlim_allow(peer,
- net->ipv4.sysctl_icmp_ratelimit);
- if (peer)
- inet_putpeer(peer);
- }
+ vif = l3mdev_master_ifindex(dst->dev);
+ peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
+ rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
+ if (peer)
+ inet_putpeer(peer);
out:
return rc;
}
@@ -396,13 +409,22 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
struct inet_sock *inet;
__be32 daddr, saddr;
u32 mark = IP4_REPLY_MARK(net, skb->mark);
+ int type = icmp_param->data.icmph.type;
+ int code = icmp_param->data.icmph.code;
if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
return;
+ /* Needed by both icmp_global_allow and icmp_xmit_lock */
+ local_bh_disable();
+
+ /* global icmp_msgs_per_sec */
+ if (!icmpv4_global_allow(net, type, code))
+ goto out_bh_enable;
+
sk = icmp_xmit_lock(net);
if (!sk)
- return;
+ goto out_bh_enable;
inet = inet_sk(sk);
icmp_param->data.icmph.checksum = 0;
@@ -425,6 +447,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.daddr = daddr;
fl4.saddr = saddr;
fl4.flowi4_mark = mark;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_proto = IPPROTO_ICMP;
fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
@@ -432,12 +455,13 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
goto out_unlock;
- if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
- icmp_param->data.icmph.code))
+ if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
ip_rt_put(rt);
out_unlock:
icmp_xmit_unlock(sk);
+out_bh_enable:
+ local_bh_enable();
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -473,6 +497,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
param->replyopts.opt.opt.faddr : iph->saddr);
fl4->saddr = saddr;
fl4->flowi4_mark = mark;
+ fl4->flowi4_uid = sock_net_uid(net, NULL);
fl4->flowi4_tos = RT_TOS(tos);
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
@@ -569,7 +594,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
struct iphdr *iph;
int room;
- struct icmp_bxm *icmp_param;
+ struct icmp_bxm icmp_param;
struct rtable *rt = skb_rtable(skb_in);
struct ipcm_cookie ipc;
struct flowi4 fl4;
@@ -646,13 +671,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
}
}
- icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
- if (!icmp_param)
- return;
+ /* Needed by both icmp_global_allow and icmp_xmit_lock */
+ local_bh_disable();
+
+ /* Check global sysctl_icmp_msgs_per_sec ratelimit */
+ if (!icmpv4_global_allow(net, type, code))
+ goto out_bh_enable;
sk = icmp_xmit_lock(net);
if (!sk)
- goto out_free;
+ goto out_bh_enable;
/*
* Construct source address and options.
@@ -679,7 +707,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
iph->tos;
mark = IP4_REPLY_MARK(net, skb_in->mark);
- if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
+ if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
goto out_unlock;
@@ -687,25 +715,26 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
* Prepare data for ICMP header.
*/
- icmp_param->data.icmph.type = type;
- icmp_param->data.icmph.code = code;
- icmp_param->data.icmph.un.gateway = info;
- icmp_param->data.icmph.checksum = 0;
- icmp_param->skb = skb_in;
- icmp_param->offset = skb_network_offset(skb_in);
+ icmp_param.data.icmph.type = type;
+ icmp_param.data.icmph.code = code;
+ icmp_param.data.icmph.un.gateway = info;
+ icmp_param.data.icmph.checksum = 0;
+ icmp_param.skb = skb_in;
+ icmp_param.offset = skb_network_offset(skb_in);
inet_sk(sk)->tos = tos;
sk->sk_mark = mark;
ipc.addr = iph->saddr;
- ipc.opt = &icmp_param->replyopts.opt;
+ ipc.opt = &icmp_param.replyopts.opt;
ipc.tx_flags = 0;
ipc.ttl = 0;
ipc.tos = -1;
rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
- type, code, icmp_param);
+ type, code, &icmp_param);
if (IS_ERR(rt))
goto out_unlock;
+ /* peer icmp_ratelimit */
if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
goto ende;
@@ -714,21 +743,21 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
room = dst_mtu(&rt->dst);
if (room > 576)
room = 576;
- room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen;
+ room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
room -= sizeof(struct icmphdr);
- icmp_param->data_len = skb_in->len - icmp_param->offset;
- if (icmp_param->data_len > room)
- icmp_param->data_len = room;
- icmp_param->head_len = sizeof(struct icmphdr);
+ icmp_param.data_len = skb_in->len - icmp_param.offset;
+ if (icmp_param.data_len > room)
+ icmp_param.data_len = room;
+ icmp_param.head_len = sizeof(struct icmphdr);
- icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+ icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
ende:
ip_rt_put(rt);
out_unlock:
icmp_xmit_unlock(sk);
-out_free:
- kfree(icmp_param);
+out_bh_enable:
+ local_bh_enable();
out:;
}
EXPORT_SYMBOL(icmp_send);
@@ -1045,12 +1074,12 @@ int icmp_rcv(struct sk_buff *skb)
if (success) {
consume_skb(skb);
- return 0;
+ return NET_RX_SUCCESS;
}
drop:
kfree_skb(skb);
- return 0;
+ return NET_RX_DROP;
csum_error:
__ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS);
error:
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 15db786d50ed..44fd86de2823 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -72,7 +72,7 @@
#include <linux/module.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
@@ -219,9 +219,14 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
static void igmp_gq_start_timer(struct in_device *in_dev)
{
int tv = prandom_u32() % in_dev->mr_maxdelay;
+ unsigned long exp = jiffies + tv + 2;
+
+ if (in_dev->mr_gq_running &&
+ time_after_eq(exp, (in_dev->mr_gq_timer).expires))
+ return;
in_dev->mr_gq_running = 1;
- if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2))
+ if (!mod_timer(&in_dev->mr_gq_timer, exp))
in_dev_hold(in_dev);
}
@@ -1167,6 +1172,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
psf->sf_crcount = im->crcount;
}
in_dev_put(pmc->interface);
+ kfree(pmc);
}
spin_unlock_bh(&im->lock);
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 61a9deec2993..5e313c1ac94f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -31,6 +31,86 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
EXPORT_SYMBOL(inet_csk_timer_bug_msg);
#endif
+#if IS_ENABLED(CONFIG_IPV6)
+/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
+ * only, and any IPv4 addresses if not IPv6 only
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
+ * and 0.0.0.0 equals to 0.0.0.0 only
+ */
+static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
+ const struct in6_addr *sk2_rcv_saddr6,
+ __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+ bool sk1_ipv6only, bool sk2_ipv6only,
+ bool match_wildcard)
+{
+ int addr_type = ipv6_addr_type(sk1_rcv_saddr6);
+ int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
+
+ /* if both are mapped, treat as IPv4 */
+ if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
+ if (!sk2_ipv6only) {
+ if (sk1_rcv_saddr == sk2_rcv_saddr)
+ return 1;
+ if (!sk1_rcv_saddr || !sk2_rcv_saddr)
+ return match_wildcard;
+ }
+ return 0;
+ }
+
+ if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
+ return 1;
+
+ if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
+ !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
+ return 1;
+
+ if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
+ !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
+ return 1;
+
+ if (sk2_rcv_saddr6 &&
+ ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6))
+ return 1;
+
+ return 0;
+}
+#endif
+
+/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ * 0.0.0.0 only equals to 0.0.0.0
+ */
+static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+ bool sk2_ipv6only, bool match_wildcard)
+{
+ if (!sk2_ipv6only) {
+ if (sk1_rcv_saddr == sk2_rcv_saddr)
+ return 1;
+ if (!sk1_rcv_saddr || !sk2_rcv_saddr)
+ return match_wildcard;
+ }
+ return 0;
+}
+
+int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+ bool match_wildcard)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr,
+ inet6_rcv_saddr(sk2),
+ sk->sk_rcv_saddr,
+ sk2->sk_rcv_saddr,
+ ipv6_only_sock(sk),
+ ipv6_only_sock(sk2),
+ match_wildcard);
+#endif
+ return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr,
+ ipv6_only_sock(sk2), match_wildcard);
+}
+EXPORT_SYMBOL(inet_rcv_saddr_equal);
+
void inet_get_local_port_range(struct net *net, int *low, int *high)
{
unsigned int seq;
@@ -44,12 +124,13 @@ void inet_get_local_port_range(struct net *net, int *low, int *high)
}
EXPORT_SYMBOL(inet_get_local_port_range);
-int inet_csk_bind_conflict(const struct sock *sk,
- const struct inet_bind_bucket *tb, bool relax)
+static int inet_csk_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb,
+ bool relax, bool reuseport_ok)
{
struct sock *sk2;
- int reuse = sk->sk_reuse;
- int reuseport = sk->sk_reuseport;
+ bool reuse = sk->sk_reuse;
+ bool reuseport = !!sk->sk_reuseport && reuseport_ok;
kuid_t uid = sock_i_uid((struct sock *)sk);
/*
@@ -61,7 +142,6 @@ int inet_csk_bind_conflict(const struct sock *sk,
sk_for_each_bound(sk2, &tb->owners) {
if (sk != sk2 &&
- !inet_v6_ipv6only(sk2) &&
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
@@ -71,53 +151,34 @@ int inet_csk_bind_conflict(const struct sock *sk,
rcu_access_pointer(sk->sk_reuseport_cb) ||
(sk2->sk_state != TCP_TIME_WAIT &&
!uid_eq(uid, sock_i_uid(sk2))))) {
-
- if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
- sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+ if (inet_rcv_saddr_equal(sk, sk2, true))
break;
}
if (!relax && reuse && sk2->sk_reuse &&
sk2->sk_state != TCP_LISTEN) {
-
- if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
- sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+ if (inet_rcv_saddr_equal(sk, sk2, true))
break;
}
}
}
return sk2 != NULL;
}
-EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- * We try to allocate an odd port (and leave even ports for connect())
+/*
+ * Find an open port number for the socket. Returns with the
+ * inet_bind_hashbucket lock held.
*/
-int inet_csk_get_port(struct sock *sk, unsigned short snum)
+static struct inet_bind_hashbucket *
+inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret)
{
- bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
- int ret = 1, attempts = 5, port = snum;
- int smallest_size = -1, smallest_port;
+ int port = 0;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
int i, low, high, attempt_half;
struct inet_bind_bucket *tb;
- kuid_t uid = sock_i_uid(sk);
u32 remaining, offset;
- if (port) {
-have_port:
- head = &hinfo->bhash[inet_bhashfn(net, port,
- hinfo->bhash_size)];
- spin_lock_bh(&head->lock);
- inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == port)
- goto tb_found;
-
- goto tb_not_found;
- }
-again:
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
inet_get_local_port_range(net, &low, &high);
@@ -141,8 +202,6 @@ other_half_scan:
* We do the opposite to not pollute connect() users.
*/
offset |= 1U;
- smallest_size = -1;
- smallest_port = low; /* avoid compiler warning */
other_parity_scan:
port = low + offset;
@@ -156,29 +215,17 @@ other_parity_scan:
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == port) {
- if (((tb->fastreuse > 0 && reuse) ||
- (tb->fastreuseport > 0 &&
- sk->sk_reuseport &&
- !rcu_access_pointer(sk->sk_reuseport_cb) &&
- uid_eq(tb->fastuid, uid))) &&
- (tb->num_owners < smallest_size || smallest_size == -1)) {
- smallest_size = tb->num_owners;
- smallest_port = port;
- }
- if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
- goto tb_found;
+ if (!inet_csk_bind_conflict(sk, tb, false, false))
+ goto success;
goto next_port;
}
- goto tb_not_found;
+ tb = NULL;
+ goto success;
next_port:
spin_unlock_bh(&head->lock);
cond_resched();
}
- if (smallest_size != -1) {
- port = smallest_port;
- goto have_port;
- }
offset--;
if (!(offset & 1))
goto other_parity_scan;
@@ -188,8 +235,74 @@ next_port:
attempt_half = 2;
goto other_half_scan;
}
- return ret;
+ return NULL;
+success:
+ *port_ret = port;
+ *tb_ret = tb;
+ return head;
+}
+static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
+ struct sock *sk)
+{
+ kuid_t uid = sock_i_uid(sk);
+
+ if (tb->fastreuseport <= 0)
+ return 0;
+ if (!sk->sk_reuseport)
+ return 0;
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ return 0;
+ if (!uid_eq(tb->fastuid, uid))
+ return 0;
+ /* We only need to check the rcv_saddr if this tb was once marked
+ * without fastreuseport and then was reset, as we can only know that
+ * the fast_*rcv_saddr doesn't have any conflicts with the socks on the
+ * owners list.
+ */
+ if (tb->fastreuseport == FASTREUSEPORT_ANY)
+ return 1;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tb->fast_sk_family == AF_INET6)
+ return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr,
+ &sk->sk_v6_rcv_saddr,
+ tb->fast_rcv_saddr,
+ sk->sk_rcv_saddr,
+ tb->fast_ipv6_only,
+ ipv6_only_sock(sk), true);
+#endif
+ return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr,
+ ipv6_only_sock(sk), true);
+}
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
+ */
+int inet_csk_get_port(struct sock *sk, unsigned short snum)
+{
+ bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+ struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+ int ret = 1, port = snum;
+ struct inet_bind_hashbucket *head;
+ struct net *net = sock_net(sk);
+ struct inet_bind_bucket *tb = NULL;
+ kuid_t uid = sock_i_uid(sk);
+
+ if (!port) {
+ head = inet_csk_find_open_port(sk, &tb, &port);
+ if (!head)
+ return ret;
+ if (!tb)
+ goto tb_not_found;
+ goto success;
+ }
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ spin_lock_bh(&head->lock);
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (net_eq(ib_net(tb), net) && tb->port == port)
+ goto tb_found;
tb_not_found:
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
net, head, port);
@@ -200,38 +313,54 @@ tb_found:
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
- if (((tb->fastreuse > 0 && reuse) ||
- (tb->fastreuseport > 0 &&
- !rcu_access_pointer(sk->sk_reuseport_cb) &&
- sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
- smallest_size == -1)
+ if ((tb->fastreuse > 0 && reuse) ||
+ sk_reuseport_match(tb, sk))
goto success;
- if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
- if ((reuse ||
- (tb->fastreuseport > 0 &&
- sk->sk_reuseport &&
- !rcu_access_pointer(sk->sk_reuseport_cb) &&
- uid_eq(tb->fastuid, uid))) &&
- smallest_size != -1 && --attempts >= 0) {
- spin_unlock_bh(&head->lock);
- goto again;
- }
+ if (inet_csk_bind_conflict(sk, tb, true, true))
goto fail_unlock;
+ }
+success:
+ if (!hlist_empty(&tb->owners)) {
+ tb->fastreuse = reuse;
+ if (sk->sk_reuseport) {
+ tb->fastreuseport = FASTREUSEPORT_ANY;
+ tb->fastuid = uid;
+ tb->fast_rcv_saddr = sk->sk_rcv_saddr;
+ tb->fast_ipv6_only = ipv6_only_sock(sk);
+#if IS_ENABLED(CONFIG_IPV6)
+ tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+#endif
+ } else {
+ tb->fastreuseport = 0;
}
+ } else {
if (!reuse)
tb->fastreuse = 0;
- if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
- tb->fastreuseport = 0;
- } else {
- tb->fastreuse = reuse;
if (sk->sk_reuseport) {
- tb->fastreuseport = 1;
- tb->fastuid = uid;
+ /* We didn't match or we don't have fastreuseport set on
+ * the tb, but we have sk_reuseport set on this socket
+ * and we know that there are no bind conflicts with
+ * this socket in this tb, so reset our tb's reuseport
+ * settings so that any subsequent sockets that match
+ * our current socket will be put on the fast path.
+ *
+ * If we reset we need to set FASTREUSEPORT_STRICT so we
+ * do extra checking for all subsequent sk_reuseport
+ * socks.
+ */
+ if (!sk_reuseport_match(tb, sk)) {
+ tb->fastreuseport = FASTREUSEPORT_STRICT;
+ tb->fastuid = uid;
+ tb->fast_rcv_saddr = sk->sk_rcv_saddr;
+ tb->fast_ipv6_only = ipv6_only_sock(sk);
+#if IS_ENABLED(CONFIG_IPV6)
+ tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+#endif
+ }
} else {
tb->fastreuseport = 0;
}
}
-success:
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, port);
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
@@ -295,7 +424,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
/*
* This will accept the next outstanding connection.
*/
-struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
+struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
@@ -415,7 +544,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
- htons(ireq->ir_num));
+ htons(ireq->ir_num), sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
@@ -452,7 +581,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
- htons(ireq->ir_num));
+ htons(ireq->ir_num), sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
@@ -707,9 +836,8 @@ void inet_csk_destroy_sock(struct sock *sk)
sk_refcnt_debug_release(sk);
- local_bh_disable();
percpu_counter_dec(sk->sk_prot->orphan_count);
- local_bh_enable();
+
sock_put(sk);
}
EXPORT_SYMBOL(inet_csk_destroy_sock);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e4d16fc5bbb3..3828b3a805cd 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -200,13 +200,22 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
goto errout;
+ /*
+ * RAW sockets might have user-defined protocols assigned,
+ * so report the one supplied on socket creation.
+ */
+ if (sk->sk_type == SOCK_RAW) {
+ if (nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))
+ goto errout;
+ }
+
if (!icsk) {
handler->idiag_get_info(sk, r, NULL);
goto out;
}
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
r->idiag_timer = 1;
r->idiag_retrans = icsk->icsk_retransmits;
@@ -852,10 +861,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct net *net = sock_net(skb->sk);
- int i, num, s_i, s_num;
u32 idiag_states = r->idiag_states;
- bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ int i, num, s_i, s_num;
+ struct sock *sk;
if (idiag_states & TCPF_SYN_RECV)
idiag_states |= TCPF_NEW_SYN_RECV;
@@ -863,16 +873,15 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
s_num = num = cb->args[2];
if (cb->args[0] == 0) {
- if (!(idiag_states & TCPF_LISTEN))
+ if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
goto skip_listen_ht;
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
struct inet_listen_hashbucket *ilb;
- struct sock *sk;
num = 0;
ilb = &hashinfo->listening_hash[i];
- spin_lock_bh(&ilb->lock);
+ spin_lock(&ilb->lock);
sk_for_each(sk, &ilb->head) {
struct inet_sock *inet = inet_sk(sk);
@@ -892,26 +901,18 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
r->id.idiag_sport)
goto next_listen;
- if (r->id.idiag_dport ||
- cb->args[3] > 0)
- goto next_listen;
-
if (inet_csk_diag_dump(sk, skb, cb, r,
bc, net_admin) < 0) {
- spin_unlock_bh(&ilb->lock);
+ spin_unlock(&ilb->lock);
goto done;
}
next_listen:
- cb->args[3] = 0;
- cb->args[4] = 0;
++num;
}
- spin_unlock_bh(&ilb->lock);
+ spin_unlock(&ilb->lock);
s_num = 0;
- cb->args[3] = 0;
- cb->args[4] = 0;
}
skip_listen_ht:
cb->args[0] = 1;
@@ -921,13 +922,14 @@ skip_listen_ht:
if (!(idiag_states & ~TCPF_LISTEN))
goto out;
+#define SKARR_SZ 16
for (i = s_i; i <= hashinfo->ehash_mask; i++) {
struct inet_ehash_bucket *head = &hashinfo->ehash[i];
spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
struct hlist_nulls_node *node;
- struct sock *sk;
-
- num = 0;
+ struct sock *sk_arr[SKARR_SZ];
+ int num_arr[SKARR_SZ];
+ int idx, accum, res;
if (hlist_nulls_empty(&head->chain))
continue;
@@ -935,9 +937,12 @@ skip_listen_ht:
if (i > s_i)
s_num = 0;
+next_chunk:
+ num = 0;
+ accum = 0;
spin_lock_bh(lock);
sk_nulls_for_each(sk, node, &head->chain) {
- int state, res;
+ int state;
if (!net_eq(sock_net(sk), net))
continue;
@@ -961,21 +966,35 @@ skip_listen_ht:
if (!inet_diag_bc_sk(bc, sk))
goto next_normal;
- res = sk_diag_fill(sk, skb, r,
+ sock_hold(sk);
+ num_arr[accum] = num;
+ sk_arr[accum] = sk;
+ if (++accum == SKARR_SZ)
+ break;
+next_normal:
+ ++num;
+ }
+ spin_unlock_bh(lock);
+ res = 0;
+ for (idx = 0; idx < accum; idx++) {
+ if (res >= 0) {
+ res = sk_diag_fill(sk_arr[idx], skb, r,
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
cb->nlh, net_admin);
- if (res < 0) {
- spin_unlock_bh(lock);
- goto done;
+ if (res < 0)
+ num = num_arr[idx];
}
-next_normal:
- ++num;
+ sock_gen_put(sk_arr[idx]);
}
-
- spin_unlock_bh(lock);
+ if (res < 0)
+ break;
cond_resched();
+ if (accum == SKARR_SZ) {
+ s_num = num + 1;
+ goto next_chunk;
+ }
}
done:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ca97835bfec4..8bea74298173 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -73,7 +73,6 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
tb->port = snum;
tb->fastreuse = 0;
tb->fastreuseport = 0;
- tb->num_owners = 0;
INIT_HLIST_HEAD(&tb->owners);
hlist_add_head(&tb->node, &head->chain);
}
@@ -96,7 +95,6 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
{
inet_sk(sk)->inet_num = snum;
sk_add_bind_node(sk, &tb->owners);
- tb->num_owners++;
inet_csk(sk)->icsk_bind_hash = tb;
}
@@ -114,7 +112,6 @@ static void __inet_put_port(struct sock *sk)
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
__sk_del_bind_node(sk);
- tb->num_owners--;
inet_csk(sk)->icsk_bind_hash = NULL;
inet_sk(sk)->inet_num = 0;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
@@ -435,10 +432,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
static int inet_reuseport_add_sock(struct sock *sk,
- struct inet_listen_hashbucket *ilb,
- int (*saddr_same)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard))
+ struct inet_listen_hashbucket *ilb)
{
struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
struct sock *sk2;
@@ -451,7 +445,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
inet_csk(sk2)->icsk_bind_hash == tb &&
sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
- saddr_same(sk, sk2, false))
+ inet_rcv_saddr_equal(sk, sk2, false))
return reuseport_add_sock(sk, sk2);
}
@@ -461,10 +455,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
return 0;
}
-int __inet_hash(struct sock *sk, struct sock *osk,
- int (*saddr_same)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard))
+int __inet_hash(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;
@@ -479,7 +470,7 @@ int __inet_hash(struct sock *sk, struct sock *osk,
spin_lock(&ilb->lock);
if (sk->sk_reuseport) {
- err = inet_reuseport_add_sock(sk, ilb, saddr_same);
+ err = inet_reuseport_add_sock(sk, ilb);
if (err)
goto unlock;
}
@@ -503,7 +494,7 @@ int inet_hash(struct sock *sk)
if (sk->sk_state != TCP_CLOSE) {
local_bh_disable();
- err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal);
+ err = __inet_hash(sk, NULL);
local_bh_enable();
}
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index ddcd56c08d14..f8aff2c71cde 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -257,8 +257,7 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
}
EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
-void inet_twsk_purge(struct inet_hashinfo *hashinfo,
- struct inet_timewait_death_row *twdr, int family)
+void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
{
struct inet_timewait_sock *tw;
struct sock *sk;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index bbe7f72db9c1..b3cdeec85f1f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -198,6 +198,7 @@ static void ip_expire(unsigned long arg)
qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
net = container_of(qp->q.net, struct net, ipv4.frags);
+ rcu_read_lock();
spin_lock(&qp->q.lock);
if (qp->q.flags & INET_FRAG_COMPLETE)
@@ -207,7 +208,7 @@ static void ip_expire(unsigned long arg)
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
if (!inet_frag_evicting(&qp->q)) {
- struct sk_buff *head = qp->q.fragments;
+ struct sk_buff *clone, *head = qp->q.fragments;
const struct iphdr *iph;
int err;
@@ -216,32 +217,40 @@ static void ip_expire(unsigned long arg)
if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
goto out;
- rcu_read_lock();
head->dev = dev_get_by_index_rcu(net, qp->iif);
if (!head->dev)
- goto out_rcu_unlock;
+ goto out;
+
/* skb has no dst, perform route lookup again */
iph = ip_hdr(head);
err = ip_route_input_noref(head, iph->daddr, iph->saddr,
iph->tos, head->dev);
if (err)
- goto out_rcu_unlock;
+ goto out;
/* Only an end host needs to send an ICMP
* "Fragment Reassembly Timeout" message, per RFC792.
*/
if (frag_expire_skip_icmp(qp->user) &&
(skb_rtable(head)->rt_type != RTN_LOCAL))
- goto out_rcu_unlock;
+ goto out;
+
+ clone = skb_clone(head, GFP_ATOMIC);
/* Send an ICMP "Fragment Reassembly Timeout" message. */
- icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
-out_rcu_unlock:
- rcu_read_unlock();
+ if (clone) {
+ spin_unlock(&qp->q.lock);
+ icmp_send(clone, ICMP_TIME_EXCEEDED,
+ ICMP_EXC_FRAGTIME, 0);
+ consume_skb(clone);
+ goto out_rcu_unlock;
+ }
}
out:
spin_unlock(&qp->q.lock);
+out_rcu_unlock:
+ rcu_read_unlock();
ipq_put(qp);
}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 576f705d8180..c9c1cb635d9a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -17,7 +17,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
@@ -113,8 +113,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
-static int ipgre_net_id __read_mostly;
-static int gre_tap_net_id __read_mostly;
+static unsigned int ipgre_net_id __read_mostly;
+static unsigned int gre_tap_net_id __read_mostly;
static void ipgre_err(struct sk_buff *skb, u32 info,
const struct tnl_ptk_info *tpi)
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 4d158ff1def1..93157f2f4758 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -15,7 +15,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unaligned.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 877bdb02e887..7a3fd25e8913 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -42,7 +42,7 @@
* Hirokazu Takahashi: sendfile() on UDP works now.
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -74,6 +74,7 @@
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <net/lwtunnel.h>
+#include <linux/bpf-cgroup.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
@@ -221,7 +222,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
if (unlikely(!neigh))
neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
if (!IS_ERR(neigh)) {
- int res = dst_neigh_output(dst, neigh, skb);
+ int res;
+
+ sock_confirm_neigh(skb, neigh);
+ res = neigh_output(neigh, skb);
rcu_read_unlock_bh();
return res;
@@ -287,6 +291,13 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
unsigned int mtu;
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ if (ret) {
+ kfree_skb(skb);
+ return ret;
+ }
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
@@ -305,6 +316,20 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
return ip_finish_output2(net, sk, skb);
}
+static int ip_mc_finish_output(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ if (ret) {
+ kfree_skb(skb);
+ return ret;
+ }
+
+ return dev_loopback_xmit(net, sk, skb);
+}
+
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
@@ -342,7 +367,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, newskb, NULL, newskb->dev,
- dev_loopback_xmit);
+ ip_mc_finish_output);
}
/* Multicasts with ttl 0 must not go beyond the host */
@@ -358,7 +383,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, newskb, NULL, newskb->dev,
- dev_loopback_xmit);
+ ip_mc_finish_output);
}
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
@@ -583,7 +608,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
*/
if (skb_has_frag_list(skb)) {
struct sk_buff *frag, *frag2;
- int first_len = skb_pagelen(skb);
+ unsigned int first_len = skb_pagelen(skb);
if (first_len - hlen > mtu ||
((first_len - hlen) & 7) ||
@@ -804,11 +829,11 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk
struct msghdr *msg = from;
if (skb->ip_summed == CHECKSUM_PARTIAL) {
- if (copy_from_iter(to, len, &msg->msg_iter) != len)
+ if (!copy_from_iter_full(to, len, &msg->msg_iter))
return -EFAULT;
} else {
__wsum csum = 0;
- if (csum_and_copy_from_iter(to, len, &csum, &msg->msg_iter) != len)
+ if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter))
return -EFAULT;
skb->csum = csum_block_add(skb->csum, csum, odd);
}
@@ -864,6 +889,9 @@ static inline int ip_ufo_append_data(struct sock *sk,
skb->csum = 0;
+ if (flags & MSG_CONFIRM)
+ skb_set_dst_pending_confirm(skb, 1);
+
__skb_queue_tail(queue, skb);
} else if (skb_is_gso(skb)) {
goto append;
@@ -936,9 +964,9 @@ static int __ip_append_data(struct sock *sk,
csummode = CHECKSUM_PARTIAL;
cork->length += length;
- if (((length > mtu) || (skb && skb_is_gso(skb))) &&
+ if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
- (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
+ (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
(sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
err = ip_ufo_append_data(sk, queue, getfrag, from, length,
hh_len, fragheaderlen, transhdrlen,
@@ -1064,6 +1092,9 @@ alloc_new_skb:
exthdrlen = 0;
csummode = CHECKSUM_NONE;
+ if ((flags & MSG_CONFIRM) && !skb_prev)
+ skb_set_dst_pending_confirm(skb, 1);
+
/*
* Put the packet on the pending queue.
*/
@@ -1594,7 +1625,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
ip_reply_arg_flowi_flags(arg),
daddr, saddr,
- tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
+ tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
+ arg->uid);
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
@@ -1606,6 +1638,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
sk->sk_protocol = ip_hdr(skb)->protocol;
sk->sk_bound_dev_if = arg->bound_dev_if;
sk->sk_sndbuf = sysctl_wmem_default;
+ sk->sk_mark = fl4.flowi4_mark;
err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
len, 0, &ipc, &rt, MSG_DONTWAIT);
if (unlikely(err)) {
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index b8a2d63d1fb8..1d46d05efb0f 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -44,7 +44,7 @@
#include <net/ip_fib.h>
#include <linux/errqueue.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* SOL_IP control messages.
@@ -97,6 +97,17 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
}
+static void ip_cmsg_recv_fragsize(struct msghdr *msg, struct sk_buff *skb)
+{
+ int val;
+
+ if (IPCB(skb)->frag_max_size == 0)
+ return;
+
+ val = IPCB(skb)->frag_max_size;
+ put_cmsg(msg, SOL_IP, IP_RECVFRAGSIZE, sizeof(val), &val);
+}
+
static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
int tlen, int offset)
{
@@ -105,10 +116,10 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
if (skb->ip_summed != CHECKSUM_COMPLETE)
return;
- if (offset != 0)
- csum = csum_sub(csum,
- csum_partial(skb_transport_header(skb) + tlen,
- offset, 0));
+ if (offset != 0) {
+ int tend_off = skb_transport_offset(skb) + tlen;
+ csum = csum_sub(csum, skb_checksum(skb, tend_off, offset, 0));
+ }
put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum);
}
@@ -137,7 +148,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
const struct iphdr *iph = ip_hdr(skb);
__be16 *ports = (__be16 *)skb_transport_header(skb);
- if (skb_transport_offset(skb) + 4 > skb->len)
+ if (skb_transport_offset(skb) + 4 > (int)skb->len)
return;
/* All current transport protocols have the port numbers in the
@@ -153,10 +164,10 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
}
-void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
- int tlen, int offset)
+void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb, int tlen, int offset)
{
- struct inet_sock *inet = inet_sk(skb->sk);
+ struct inet_sock *inet = inet_sk(sk);
unsigned int flags = inet->cmsg_flags;
/* Ordered by supposed usage frequency */
@@ -218,6 +229,9 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
if (flags & IP_CMSG_CHECKSUM)
ip_cmsg_recv_checksum(msg, skb, tlen, offset);
+
+ if (flags & IP_CMSG_RECVFRAGSIZE)
+ ip_cmsg_recv_fragsize(msg, skb);
}
EXPORT_SYMBOL(ip_cmsg_recv_offset);
@@ -258,7 +272,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
continue;
switch (cmsg->cmsg_type) {
case IP_RETOPTS:
- err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+ err = cmsg->cmsg_len - sizeof(struct cmsghdr);
/* Our caller is responsible for freeing ipc->opt */
err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
@@ -474,16 +488,15 @@ static bool ipv4_datagram_support_cmsg(const struct sock *sk,
return false;
/* Support IP_PKTINFO on tstamp packets if requested, to correlate
- * timestamp with egress dev. Not possible for packets without dev
+ * timestamp with egress dev. Not possible for packets without iif
* or without payload (SOF_TIMESTAMPING_OPT_TSONLY).
*/
- if ((!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) ||
- (!skb->dev))
+ info = PKTINFO_SKB_CB(skb);
+ if (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG) ||
+ !info->ipi_ifindex)
return false;
- info = PKTINFO_SKB_CB(skb);
info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr;
- info->ipi_ifindex = skb->dev->ifindex;
return true;
}
@@ -577,6 +590,7 @@ static bool setsockopt_needs_rtnl(int optname)
case MCAST_LEAVE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
case MCAST_UNBLOCK_SOURCE:
+ case IP_ROUTER_ALERT:
return true;
}
return false;
@@ -614,6 +628,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
case IP_MULTICAST_LOOP:
case IP_RECVORIGDSTADDR:
case IP_CHECKSUM:
+ case IP_RECVFRAGSIZE:
if (optlen >= sizeof(int)) {
if (get_user(val, (int __user *) optval))
return -EFAULT;
@@ -726,6 +741,14 @@ static int do_ip_setsockopt(struct sock *sk, int level,
}
}
break;
+ case IP_RECVFRAGSIZE:
+ if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM)
+ goto e_inval;
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_RECVFRAGSIZE;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE;
+ break;
case IP_TOS: /* This sets both TOS and Precedence */
if (sk->sk_type == SOCK_STREAM) {
val &= ~INET_ECN_MASK;
@@ -820,6 +843,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
{
struct ip_mreqn mreq;
struct net_device *dev = NULL;
+ int midx;
if (sk->sk_type == SOCK_STREAM)
goto e_inval;
@@ -864,11 +888,15 @@ static int do_ip_setsockopt(struct sock *sk, int level,
err = -EADDRNOTAVAIL;
if (!dev)
break;
+
+ midx = l3mdev_master_ifindex(dev);
+
dev_put(dev);
err = -EINVAL;
if (sk->sk_bound_dev_if &&
- mreq.imr_ifindex != sk->sk_bound_dev_if)
+ mreq.imr_ifindex != sk->sk_bound_dev_if &&
+ (!midx || midx != sk->sk_bound_dev_if))
break;
inet->mc_index = mreq.imr_ifindex;
@@ -1202,14 +1230,27 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
* which has interface index (iif) as the first member of the
* underlying inet{6}_skb_parm struct. This code then overlays
* PKTINFO_SKB_CB and in_pktinfo also has iif as the first
- * element so the iif is picked up from the prior IPCB
+ * element so the iif is picked up from the prior IPCB. If iif
+ * is the loopback interface, then return the sending interface
+ * (e.g., process binds socket to eth0 for Tx which is
+ * redirected to loopback in the rtable/dst).
*/
+ if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX)
+ pktinfo->ipi_ifindex = inet_iif(skb);
+
pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
} else {
pktinfo->ipi_ifindex = 0;
pktinfo->ipi_spec_dst.s_addr = 0;
}
- skb_dst_drop(skb);
+ /* We need to keep the dst for __ip_options_echo()
+ * We could restrict the test to opt.ts_needtime || opt.srr,
+ * but the following is good enough as IP options are not often used.
+ */
+ if (unlikely(IPCB(skb)->opt.optlen))
+ skb_dst_force(skb);
+ else
+ skb_dst_drop(skb);
}
int ip_setsockopt(struct sock *sk, int level,
@@ -1357,6 +1398,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_CHECKSUM:
val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0;
break;
+ case IP_RECVFRAGSIZE:
+ val = (inet->cmsg_flags & IP_CMSG_RECVFRAGSIZE) != 0;
+ break;
case IP_TOS:
val = inet->tos;
break;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 5719d6ba0824..823abaef006b 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -358,6 +358,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
{
struct ip_tunnel *nt;
struct net_device *dev;
+ int t_hlen;
BUG_ON(!itn->fb_tunnel_dev);
dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
@@ -367,6 +368,9 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
dev->mtu = ip_tunnel_bind_dev(dev);
nt = netdev_priv(dev);
+ t_hlen = nt->hlen + sizeof(struct iphdr);
+ dev->min_mtu = ETH_MIN_MTU;
+ dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
ip_tunnel_add(itn, nt);
return nt;
}
@@ -929,7 +933,7 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
- if (new_mtu < 68)
+ if (new_mtu < ETH_MIN_MTU)
return -EINVAL;
if (new_mtu > max_mtu) {
@@ -990,7 +994,7 @@ int ip_tunnel_get_iflink(const struct net_device *dev)
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);
-int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
+int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
struct rtnl_link_ops *ops, char *devname)
{
struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
@@ -1192,7 +1196,7 @@ void ip_tunnel_uninit(struct net_device *dev)
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
/* Do least required initialization, rest of init is done in tunnel_init call */
-void ip_tunnel_setup(struct net_device *dev, int net_id)
+void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
tunnel->ip_tnl_net_id = net_id;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index fed3d29f9eb3..a31f47ccaad9 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -188,8 +188,8 @@ int iptunnel_handle_offloads(struct sk_buff *skb,
EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
/* Often modified stats are per cpu, other are shared (netdev->stats) */
-struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *tot)
+void ip_tunnel_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *tot)
{
int i;
@@ -214,8 +214,6 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
tot->rx_bytes += rx_bytes;
tot->tx_bytes += tx_bytes;
}
-
- return tot;
}
EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
@@ -228,7 +226,7 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
[LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 },
};
-static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
+static int ip_tun_build_state(struct nlattr *attr,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts)
{
@@ -313,6 +311,7 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
.fill_encap = ip_tun_fill_encap_info,
.get_encap_size = ip_tun_encap_nlsize,
.cmp_encap = ip_tun_cmp_encap,
+ .owner = THIS_MODULE,
};
static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
@@ -324,7 +323,7 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
[LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 },
};
-static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr,
+static int ip6_tun_build_state(struct nlattr *attr,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts)
{
@@ -403,6 +402,7 @@ static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
.fill_encap = ip6_tun_fill_encap_info,
.get_encap_size = ip6_tun_encap_nlsize,
.cmp_encap = ip_tun_cmp_encap,
+ .owner = THIS_MODULE,
};
void __init ip_tunnel_core_init(void)
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 5d7944f394d9..8b14f1404c8f 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -46,7 +46,7 @@
static struct rtnl_link_ops vti_link_ops __read_mostly;
-static int vti_net_id __read_mostly;
+static unsigned int vti_net_id __read_mostly;
static int vti_tunnel_init(struct net_device *dev);
static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 071a785c65eb..dfb2ab2dd3c8 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -61,7 +61,7 @@
#include <net/ipconfig.h>
#include <net/route.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/checksum.h>
#include <asm/processor.h>
@@ -306,7 +306,7 @@ static void __init ic_close_devs(void)
while ((d = next)) {
next = d->next;
dev = d->dev;
- if ((!ic_dev || dev != ic_dev->dev) && !netdev_uses_dsa(dev)) {
+ if (d != ic_dev && !netdev_uses_dsa(dev)) {
pr_debug("IP-Config: Downing %s\n", dev->name);
dev_change_flags(dev, d->flags);
}
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index c9392589c415..00d4229b6954 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -96,7 +96,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
@@ -121,7 +121,7 @@ static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
-static int ipip_net_id __read_mostly;
+static unsigned int ipip_net_id __read_mostly;
static int ipip_tunnel_init(struct net_device *dev);
static struct rtnl_link_ops ipip_link_ops __read_mostly;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 27089f5ebbb1..b036e85e093b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -26,7 +26,7 @@
*
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
@@ -137,6 +137,9 @@ static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
.flags = FIB_LOOKUP_NOREF,
};
+ /* update flow if oif or iif point to device enslaved to l3mdev */
+ l3mdev_update_flow(net, flowi4_to_flowi(flp4));
+
err = fib_rules_lookup(net->ipv4.mr_rules_ops,
flowi4_to_flowi(flp4), 0, &arg);
if (err < 0)
@@ -163,7 +166,9 @@ static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
return -EINVAL;
}
- mrt = ipmr_get_table(rule->fr_net, rule->table);
+ arg->table = fib_rule_get_table(rule, arg);
+
+ mrt = ipmr_get_table(rule->fr_net, arg->table);
if (!mrt)
return -EAGAIN;
res->mrt = mrt;
@@ -294,10 +299,29 @@ static void __net_exit ipmr_rules_exit(struct net *net)
}
#endif
+static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct mfc_cache_cmp_arg *cmparg = arg->key;
+ struct mfc_cache *c = (struct mfc_cache *)ptr;
+
+ return cmparg->mfc_mcastgrp != c->mfc_mcastgrp ||
+ cmparg->mfc_origin != c->mfc_origin;
+}
+
+static const struct rhashtable_params ipmr_rht_params = {
+ .head_offset = offsetof(struct mfc_cache, mnode),
+ .key_offset = offsetof(struct mfc_cache, cmparg),
+ .key_len = sizeof(struct mfc_cache_cmp_arg),
+ .nelem_hint = 3,
+ .locks_mul = 1,
+ .obj_cmpfn = ipmr_hash_cmp,
+ .automatic_shrinking = true,
+};
+
static struct mr_table *ipmr_new_table(struct net *net, u32 id)
{
struct mr_table *mrt;
- unsigned int i;
/* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
if (id != RT_TABLE_DEFAULT && id >= 1000000000)
@@ -313,10 +337,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
write_pnet(&mrt->net, net);
mrt->id = id;
- /* Forwarding cache */
- for (i = 0; i < MFC_LINES; i++)
- INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
-
+ rhltable_init(&mrt->mfc_hash, &ipmr_rht_params);
+ INIT_LIST_HEAD(&mrt->mfc_cache_list);
INIT_LIST_HEAD(&mrt->mfc_unres_queue);
setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
@@ -333,6 +355,7 @@ static void ipmr_free_table(struct mr_table *mrt)
{
del_timer_sync(&mrt->ipmr_expire_timer);
mroute_clean_tables(mrt, true);
+ rhltable_destroy(&mrt->mfc_hash);
kfree(mrt);
}
@@ -834,13 +857,17 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
__be32 origin,
__be32 mcastgrp)
{
- int line = MFC_HASH(mcastgrp, origin);
+ struct mfc_cache_cmp_arg arg = {
+ .mfc_mcastgrp = mcastgrp,
+ .mfc_origin = origin
+ };
+ struct rhlist_head *tmp, *list;
struct mfc_cache *c;
- list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
- if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
- return c;
- }
+ list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+ rhl_for_each_entry_rcu(c, tmp, list, mnode)
+ return c;
+
return NULL;
}
@@ -848,13 +875,16 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
int vifi)
{
- int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY));
+ struct mfc_cache_cmp_arg arg = {
+ .mfc_mcastgrp = htonl(INADDR_ANY),
+ .mfc_origin = htonl(INADDR_ANY)
+ };
+ struct rhlist_head *tmp, *list;
struct mfc_cache *c;
- list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
- if (c->mfc_origin == htonl(INADDR_ANY) &&
- c->mfc_mcastgrp == htonl(INADDR_ANY) &&
- c->mfc_un.res.ttls[vifi] < 255)
+ list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+ rhl_for_each_entry_rcu(c, tmp, list, mnode)
+ if (c->mfc_un.res.ttls[vifi] < 255)
return c;
return NULL;
@@ -864,29 +894,51 @@ static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
__be32 mcastgrp, int vifi)
{
- int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY));
+ struct mfc_cache_cmp_arg arg = {
+ .mfc_mcastgrp = mcastgrp,
+ .mfc_origin = htonl(INADDR_ANY)
+ };
+ struct rhlist_head *tmp, *list;
struct mfc_cache *c, *proxy;
if (mcastgrp == htonl(INADDR_ANY))
goto skip;
- list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
- if (c->mfc_origin == htonl(INADDR_ANY) &&
- c->mfc_mcastgrp == mcastgrp) {
- if (c->mfc_un.res.ttls[vifi] < 255)
- return c;
-
- /* It's ok if the vifi is part of the static tree */
- proxy = ipmr_cache_find_any_parent(mrt,
- c->mfc_parent);
- if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
- return c;
- }
+ list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+ rhl_for_each_entry_rcu(c, tmp, list, mnode) {
+ if (c->mfc_un.res.ttls[vifi] < 255)
+ return c;
+
+ /* It's ok if the vifi is part of the static tree */
+ proxy = ipmr_cache_find_any_parent(mrt, c->mfc_parent);
+ if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
+ return c;
+ }
skip:
return ipmr_cache_find_any_parent(mrt, vifi);
}
+/* Look for a (S,G,iif) entry if parent != -1 */
+static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt,
+ __be32 origin, __be32 mcastgrp,
+ int parent)
+{
+ struct mfc_cache_cmp_arg arg = {
+ .mfc_mcastgrp = mcastgrp,
+ .mfc_origin = origin,
+ };
+ struct rhlist_head *tmp, *list;
+ struct mfc_cache *c;
+
+ list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+ rhl_for_each_entry_rcu(c, tmp, list, mnode)
+ if (parent == -1 || parent == c->mfc_parent)
+ return c;
+
+ return NULL;
+}
+
/* Allocate a multicast cache entry */
static struct mfc_cache *ipmr_cache_alloc(void)
{
@@ -1023,10 +1075,10 @@ static int ipmr_cache_report(struct mr_table *mrt,
static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
struct sk_buff *skb)
{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct mfc_cache *c;
bool found = false;
int err;
- struct mfc_cache *c;
- const struct iphdr *iph = ip_hdr(skb);
spin_lock_bh(&mfc_unres_lock);
list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
@@ -1090,46 +1142,39 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
{
- int line;
- struct mfc_cache *c, *next;
+ struct mfc_cache *c;
- line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+ /* The entries are added/deleted only under RTNL */
+ rcu_read_lock();
+ c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
+ mfc->mfcc_mcastgrp.s_addr, parent);
+ rcu_read_unlock();
+ if (!c)
+ return -ENOENT;
+ rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
+ list_del_rcu(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
+ ipmr_cache_free(c);
- list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
- if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
- c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
- (parent == -1 || parent == c->mfc_parent)) {
- list_del_rcu(&c->list);
- mroute_netlink_event(mrt, c, RTM_DELROUTE);
- ipmr_cache_free(c);
- return 0;
- }
- }
- return -ENOENT;
+ return 0;
}
static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
struct mfcctl *mfc, int mrtsock, int parent)
{
- bool found = false;
- int line;
struct mfc_cache *uc, *c;
+ bool found;
+ int ret;
if (mfc->mfcc_parent >= MAXVIFS)
return -ENFILE;
- line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
-
- list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
- if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
- c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
- (parent == -1 || parent == c->mfc_parent)) {
- found = true;
- break;
- }
- }
-
- if (found) {
+ /* The entries are added/deleted only under RTNL */
+ rcu_read_lock();
+ c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
+ mfc->mfcc_mcastgrp.s_addr, parent);
+ rcu_read_unlock();
+ if (c) {
write_lock_bh(&mrt_lock);
c->mfc_parent = mfc->mfcc_parent;
ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
@@ -1155,8 +1200,14 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
if (!mrtsock)
c->mfc_flags |= MFC_STATIC;
- list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
-
+ ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode,
+ ipmr_rht_params);
+ if (ret) {
+ pr_err("ipmr: rhtable insert error %d\n", ret);
+ ipmr_cache_free(c);
+ return ret;
+ }
+ list_add_tail_rcu(&c->list, &mrt->mfc_cache_list);
/* Check to see if we resolved a queued list. If so we
* need to send on the frames and tidy up.
*/
@@ -1186,9 +1237,9 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
/* Close the multicast socket, and clear the vif tables etc */
static void mroute_clean_tables(struct mr_table *mrt, bool all)
{
- int i;
+ struct mfc_cache *c, *tmp;
LIST_HEAD(list);
- struct mfc_cache *c, *next;
+ int i;
/* Shut down all active vif entries */
for (i = 0; i < mrt->maxvif; i++) {
@@ -1199,19 +1250,18 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
unregister_netdevice_many(&list);
/* Wipe the cache */
- for (i = 0; i < MFC_LINES; i++) {
- list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
- if (!all && (c->mfc_flags & MFC_STATIC))
- continue;
- list_del_rcu(&c->list);
- mroute_netlink_event(mrt, c, RTM_DELROUTE);
- ipmr_cache_free(c);
- }
+ list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
+ if (!all && (c->mfc_flags & MFC_STATIC))
+ continue;
+ rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
+ list_del_rcu(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
+ ipmr_cache_free(c);
}
if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
spin_lock_bh(&mfc_unres_lock);
- list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
+ list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
list_del(&c->list);
mroute_netlink_event(mrt, c, RTM_DELROUTE);
ipmr_destroy_unres(mrt, c);
@@ -1228,7 +1278,7 @@ static void mrtsock_destruct(struct sock *sk)
struct net *net = sock_net(sk);
struct mr_table *mrt;
- rtnl_lock();
+ ASSERT_RTNL();
ipmr_for_each_table(mrt, net) {
if (sk == rtnl_dereference(mrt->mroute_sk)) {
IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
@@ -1239,7 +1289,6 @@ static void mrtsock_destruct(struct sock *sk)
mroute_clean_tables(mrt, false);
}
}
- rtnl_unlock();
}
/* Socket options and virtual interface manipulation. The whole
@@ -1303,13 +1352,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
if (sk != rcu_access_pointer(mrt->mroute_sk)) {
ret = -EACCES;
} else {
- /* We need to unlock here because mrtsock_destruct takes
- * care of rtnl itself and we can't change that due to
- * the IP_ROUTER_ALERT setsockopt which runs without it.
- */
- rtnl_unlock();
ret = ip_ra_control(sk, 0, NULL);
- goto out;
+ goto out_unlock;
}
break;
case MRT_ADD_VIF:
@@ -1420,7 +1464,6 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
}
out_unlock:
rtnl_unlock();
-out:
return ret;
}
@@ -1786,9 +1829,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
struct sk_buff *skb, struct mfc_cache *cache,
int local)
{
+ int true_vifi = ipmr_find_vif(mrt, skb->dev);
int psend = -1;
int vif, ct;
- int true_vifi = ipmr_find_vif(mrt, skb->dev);
vif = cache->mfc_parent;
cache->mfc_un.res.pkt++;
@@ -1809,6 +1852,12 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
/* Wrong interface: drop packet and (maybe) send PIM assert. */
if (mrt->vif_table[vif].dev != skb->dev) {
+ struct net_device *mdev;
+
+ mdev = l3mdev_master_dev_rcu(mrt->vif_table[vif].dev);
+ if (mdev == skb->dev)
+ goto forward;
+
if (rt_is_output_route(skb_rtable(skb))) {
/* It is our own packet, looped back.
* Very complicated situation...
@@ -2053,7 +2102,7 @@ static int pim_rcv(struct sk_buff *skb)
goto drop;
pim = (struct pimreghdr *)skb_transport_header(skb);
- if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
+ if (pim->type != ((PIM_VERSION << 4) | (PIM_TYPE_REGISTER)) ||
(pim->flags & PIM_NULL_REGISTER) ||
(ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
csum_fold(skb_checksum(skb, 0, skb->len, 0))))
@@ -2080,8 +2129,10 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
int ct;
/* If cache is unresolved, don't try to parse IIF and OIF */
- if (c->mfc_parent >= MAXVIFS)
+ if (c->mfc_parent >= MAXVIFS) {
+ rtm->rtm_flags |= RTNH_F_UNRESOLVED;
return -ENOENT;
+ }
if (VIF_EXISTS(mrt, c->mfc_parent) &&
nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
@@ -2123,7 +2174,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
int ipmr_get_route(struct net *net, struct sk_buff *skb,
__be32 saddr, __be32 daddr,
- struct rtmsg *rtm, int nowait, u32 portid)
+ struct rtmsg *rtm, u32 portid)
{
struct mfc_cache *cache;
struct mr_table *mrt;
@@ -2147,11 +2198,6 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
struct net_device *dev;
int vif = -1;
- if (nowait) {
- rcu_read_unlock();
- return -EAGAIN;
- }
-
dev = skb->dev;
read_lock(&mrt_lock);
if (dev)
@@ -2285,34 +2331,30 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
struct mr_table *mrt;
struct mfc_cache *mfc;
unsigned int t = 0, s_t;
- unsigned int h = 0, s_h;
unsigned int e = 0, s_e;
s_t = cb->args[0];
- s_h = cb->args[1];
- s_e = cb->args[2];
+ s_e = cb->args[1];
rcu_read_lock();
ipmr_for_each_table(mrt, net) {
if (t < s_t)
goto next_table;
- if (t > s_t)
- s_h = 0;
- for (h = s_h; h < MFC_LINES; h++) {
- list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
- if (e < s_e)
- goto next_entry;
- if (ipmr_fill_mroute(mrt, skb,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- mfc, RTM_NEWROUTE,
- NLM_F_MULTI) < 0)
- goto done;
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+ if (e < s_e)
+ goto next_entry;
+ if (ipmr_fill_mroute(mrt, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ mfc, RTM_NEWROUTE,
+ NLM_F_MULTI) < 0)
+ goto done;
next_entry:
- e++;
- }
- e = s_e = 0;
+ e++;
}
+ e = 0;
+ s_e = 0;
+
spin_lock_bh(&mfc_unres_lock);
list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
if (e < s_e)
@@ -2329,16 +2371,15 @@ next_entry2:
e++;
}
spin_unlock_bh(&mfc_unres_lock);
- e = s_e = 0;
- s_h = 0;
+ e = 0;
+ s_e = 0;
next_table:
t++;
}
done:
rcu_read_unlock();
- cb->args[2] = e;
- cb->args[1] = h;
+ cb->args[1] = e;
cb->args[0] = t;
return skb->len;
@@ -2548,7 +2589,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
const char *name = vif->dev ? vif->dev->name : "none";
seq_printf(seq,
- "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
+ "%2zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
vif - mrt->vif_table,
name, vif->bytes_in, vif->pkt_in,
vif->bytes_out, vif->pkt_out,
@@ -2582,10 +2623,8 @@ struct ipmr_mfc_iter {
struct seq_net_private p;
struct mr_table *mrt;
struct list_head *cache;
- int ct;
};
-
static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
struct ipmr_mfc_iter *it, loff_t pos)
{
@@ -2593,12 +2632,10 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
struct mfc_cache *mfc;
rcu_read_lock();
- for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
- it->cache = &mrt->mfc_cache_array[it->ct];
- list_for_each_entry_rcu(mfc, it->cache, list)
- if (pos-- == 0)
- return mfc;
- }
+ it->cache = &mrt->mfc_cache_list;
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
+ if (pos-- == 0)
+ return mfc;
rcu_read_unlock();
spin_lock_bh(&mfc_unres_lock);
@@ -2625,17 +2662,16 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
it->mrt = mrt;
it->cache = NULL;
- it->ct = 0;
return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
: SEQ_START_TOKEN;
}
static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct mfc_cache *mfc = v;
struct ipmr_mfc_iter *it = seq->private;
struct net *net = seq_file_net(seq);
struct mr_table *mrt = it->mrt;
+ struct mfc_cache *mfc = v;
++*pos;
@@ -2648,19 +2684,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
if (it->cache == &mrt->mfc_unres_queue)
goto end_of_list;
- BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
-
- while (++it->ct < MFC_LINES) {
- it->cache = &mrt->mfc_cache_array[it->ct];
- if (list_empty(it->cache))
- continue;
- return list_first_entry(it->cache, struct mfc_cache, list);
- }
-
/* exhausted cache_array, show unresolved */
rcu_read_unlock();
it->cache = &mrt->mfc_unres_queue;
- it->ct = 0;
spin_lock_bh(&mfc_unres_lock);
if (!list_empty(it->cache))
@@ -2680,7 +2706,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
if (it->cache == &mrt->mfc_unres_queue)
spin_unlock_bh(&mfc_unres_lock);
- else if (it->cache == &mrt->mfc_cache_array[it->ct])
+ else if (it->cache == &mrt->mfc_cache_list)
rcu_read_unlock();
}
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index b3cc1335adbc..c0cc6aa8cfaa 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -23,7 +23,8 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
struct rtable *rt;
struct flowi4 fl4 = {};
__be32 saddr = iph->saddr;
- __u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
+ const struct sock *sk = skb_to_full_sk(skb);
+ __u8 flags = sk ? inet_sk_flowi_flags(sk) : 0;
struct net_device *dev = skb_dst(skb)->dev;
unsigned int hh_len;
@@ -40,7 +41,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
fl4.daddr = iph->daddr;
fl4.saddr = saddr;
fl4.flowi4_tos = RT_TOS(iph->tos);
- fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
+ fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
if (!fl4.flowi4_oif)
fl4.flowi4_oif = l3mdev_master_ifindex(dev);
fl4.flowi4_mark = skb->mark;
@@ -61,7 +62,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
struct dst_entry *dst = skb_dst(skb);
skb_dst_set(skb, NULL);
- dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
+ dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), sk, 0);
if (IS_ERR(dst))
return PTR_ERR(dst);
skb_dst_set(skb, dst);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index d613309e3e5d..c11eb1744ab1 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -25,6 +25,12 @@ config NF_CONNTRACK_IPV4
To compile it as a module, choose M here. If unsure, say N.
+config NF_SOCKET_IPV4
+ tristate "IPv4 socket lookup support"
+ help
+ This option enables the IPv4 socket lookup infrastructure. This is
+ is required by the iptables socket match.
+
if NF_TABLES
config NF_TABLES_IPV4
@@ -54,6 +60,14 @@ config NFT_DUP_IPV4
help
This module enables IPv4 packet duplication support for nf_tables.
+config NFT_FIB_IPV4
+ select NFT_FIB
+ tristate "nf_tables fib / ip route lookup support"
+ help
+ This module enables IPv4 FIB lookups, e.g. for reverse path filtering.
+ It also allows query of the FIB for the route type, e.g. local, unicast,
+ multicast or blackhole.
+
endif # NF_TABLES_IPV4
config NF_TABLES_ARP
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 853328f8fd05..f462fee66ac8 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -14,6 +14,8 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
# defrag
obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
+obj-$(CONFIG_NF_SOCKET_IPV4) += nf_socket_ipv4.o
+
# logging
obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o
@@ -34,6 +36,7 @@ obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
+obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o
obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o
obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o
obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 697538464e6e..6241a81fd7f5 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -24,7 +24,7 @@
#include <linux/err.h>
#include <net/compat.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_arp/arp_tables.h>
@@ -217,11 +217,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
*/
e = get_entry(table_base, private->hook_entry[hook]);
- acpar.net = state->net;
- acpar.in = state->in;
- acpar.out = state->out;
- acpar.hooknum = hook;
- acpar.family = NFPROTO_ARP;
+ acpar.state = state;
acpar.hotdrop = false;
arp = arp_hdr(skb);
@@ -415,17 +411,15 @@ static inline int check_target(struct arpt_entry *e, const char *name)
}
static inline int
-find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
+find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
+ struct xt_percpu_counter_alloc_state *alloc_state)
{
struct xt_entry_target *t;
struct xt_target *target;
- unsigned long pcnt;
int ret;
- pcnt = xt_percpu_counter_alloc();
- if (IS_ERR_VALUE(pcnt))
+ if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
return -ENOMEM;
- e->counters.pcnt = pcnt;
t = arpt_get_target(e);
target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
@@ -443,7 +437,7 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
err:
module_put(t->u.kernel.target->me);
out:
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
return ret;
}
@@ -523,7 +517,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
}
/* Checks and translates the user-supplied table segment (held in
@@ -532,6 +526,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
static int translate_table(struct xt_table_info *newinfo, void *entry0,
const struct arpt_replace *repl)
{
+ struct xt_percpu_counter_alloc_state alloc_state = { 0 };
struct arpt_entry *iter;
unsigned int *offsets;
unsigned int i;
@@ -594,7 +589,8 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
/* Finally, each sanity check must pass */
i = 0;
xt_entry_foreach(iter, entry0, newinfo->size) {
- ret = find_check_entry(iter, repl->name, repl->size);
+ ret = find_check_entry(iter, repl->name, repl->size,
+ &alloc_state);
if (ret != 0)
break;
++i;
@@ -681,11 +677,6 @@ static int copy_entries_to_user(unsigned int total_size,
return PTR_ERR(counters);
loc_cpu_entry = private->entries;
- /* ... then copy entire thing ... */
- if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
- ret = -EFAULT;
- goto free_counters;
- }
/* FIXME: use iterator macros --RR */
/* ... then go back and fix counters and names */
@@ -693,6 +684,10 @@ static int copy_entries_to_user(unsigned int total_size,
const struct xt_entry_target *t;
e = (struct arpt_entry *)(loc_cpu_entry + off);
+ if (copy_to_user(userptr + off, e, sizeof(*e))) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
if (copy_to_user(userptr + off
+ offsetof(struct arpt_entry, counters),
&counters[num],
@@ -702,11 +697,7 @@ static int copy_entries_to_user(unsigned int total_size,
}
t = arpt_get_target_c(e);
- if (copy_to_user(userptr + off + e->target_offset
- + offsetof(struct xt_entry_target,
- u.user.name),
- t->u.kernel.target->name,
- strlen(t->u.kernel.target->name)+1) != 0) {
+ if (xt_target_to_user(t, userptr + off + e->target_offset)) {
ret = -EFAULT;
goto free_counters;
}
@@ -809,7 +800,7 @@ static int get_info(struct net *net, void __user *user,
#endif
t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
"arptable_%s", name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
struct arpt_getinfo info;
const struct xt_table_info *private = t->private;
#ifdef CONFIG_COMPAT
@@ -838,7 +829,7 @@ static int get_info(struct net *net, void __user *user,
xt_table_unlock(t);
module_put(t->me);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
#ifdef CONFIG_COMPAT
if (compat)
xt_compat_unlock(NFPROTO_ARP);
@@ -863,7 +854,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
const struct xt_table_info *private = t->private;
if (get.size == private->size)
@@ -875,7 +866,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
return ret;
}
@@ -902,8 +893,8 @@ static int __do_replace(struct net *net, const char *name,
t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
"arptable_%s", name);
- if (IS_ERR_OR_NULL(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ if (!t) {
+ ret = -ENOENT;
goto free_newinfo_counters_untrans;
}
@@ -1018,8 +1009,8 @@ static int do_add_counters(struct net *net, const void __user *user,
return PTR_ERR(paddc);
t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name);
- if (IS_ERR_OR_NULL(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ if (!t) {
+ ret = -ENOENT;
goto free;
}
@@ -1408,7 +1399,7 @@ static int compat_get_entries(struct net *net,
xt_compat_lock(NFPROTO_ARP);
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
@@ -1423,7 +1414,7 @@ static int compat_get_entries(struct net *net,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
xt_compat_unlock(NFPROTO_ARP);
return ret;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 7c00ce90adb8..384b85713e06 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -20,7 +20,7 @@
#include <linux/icmp.h>
#include <net/ip.h>
#include <net/compat.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/mutex.h>
#include <linux/proc_fs.h>
#include <linux/err.h>
@@ -261,11 +261,7 @@ ipt_do_table(struct sk_buff *skb,
acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
acpar.thoff = ip_hdrlen(skb);
acpar.hotdrop = false;
- acpar.net = state->net;
- acpar.in = state->in;
- acpar.out = state->out;
- acpar.family = NFPROTO_IPV4;
- acpar.hooknum = hook;
+ acpar.state = state;
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
local_bh_disable();
@@ -535,7 +531,8 @@ static int check_target(struct ipt_entry *e, struct net *net, const char *name)
static int
find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
- unsigned int size)
+ unsigned int size,
+ struct xt_percpu_counter_alloc_state *alloc_state)
{
struct xt_entry_target *t;
struct xt_target *target;
@@ -543,12 +540,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
unsigned int j;
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
- unsigned long pcnt;
- pcnt = xt_percpu_counter_alloc();
- if (IS_ERR_VALUE(pcnt))
+ if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
return -ENOMEM;
- e->counters.pcnt = pcnt;
j = 0;
mtpar.net = net;
@@ -586,7 +580,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
cleanup_match(ematch, net);
}
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
return ret;
}
@@ -674,7 +668,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
}
/* Checks and translates the user-supplied table segment (held in
@@ -683,6 +677,7 @@ static int
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
const struct ipt_replace *repl)
{
+ struct xt_percpu_counter_alloc_state alloc_state = { 0 };
struct ipt_entry *iter;
unsigned int *offsets;
unsigned int i;
@@ -742,7 +737,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
/* Finally, each sanity check must pass */
i = 0;
xt_entry_foreach(iter, entry0, newinfo->size) {
- ret = find_check_entry(iter, net, repl->name, repl->size);
+ ret = find_check_entry(iter, net, repl->name, repl->size,
+ &alloc_state);
if (ret != 0)
break;
++i;
@@ -830,10 +826,6 @@ copy_entries_to_user(unsigned int total_size,
return PTR_ERR(counters);
loc_cpu_entry = private->entries;
- if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
- ret = -EFAULT;
- goto free_counters;
- }
/* FIXME: use iterator macros --RR */
/* ... then go back and fix counters and names */
@@ -843,6 +835,10 @@ copy_entries_to_user(unsigned int total_size,
const struct xt_entry_target *t;
e = (struct ipt_entry *)(loc_cpu_entry + off);
+ if (copy_to_user(userptr + off, e, sizeof(*e))) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
if (copy_to_user(userptr + off
+ offsetof(struct ipt_entry, counters),
&counters[num],
@@ -856,23 +852,14 @@ copy_entries_to_user(unsigned int total_size,
i += m->u.match_size) {
m = (void *)e + i;
- if (copy_to_user(userptr + off + i
- + offsetof(struct xt_entry_match,
- u.user.name),
- m->u.kernel.match->name,
- strlen(m->u.kernel.match->name)+1)
- != 0) {
+ if (xt_match_to_user(m, userptr + off + i)) {
ret = -EFAULT;
goto free_counters;
}
}
t = ipt_get_target_c(e);
- if (copy_to_user(userptr + off + e->target_offset
- + offsetof(struct xt_entry_target,
- u.user.name),
- t->u.kernel.target->name,
- strlen(t->u.kernel.target->name)+1) != 0) {
+ if (xt_target_to_user(t, userptr + off + e->target_offset)) {
ret = -EFAULT;
goto free_counters;
}
@@ -977,7 +964,7 @@ static int get_info(struct net *net, void __user *user,
#endif
t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
"iptable_%s", name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
struct ipt_getinfo info;
const struct xt_table_info *private = t->private;
#ifdef CONFIG_COMPAT
@@ -1007,7 +994,7 @@ static int get_info(struct net *net, void __user *user,
xt_table_unlock(t);
module_put(t->me);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
#ifdef CONFIG_COMPAT
if (compat)
xt_compat_unlock(AF_INET);
@@ -1032,7 +1019,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, AF_INET, get.name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
const struct xt_table_info *private = t->private;
if (get.size == private->size)
ret = copy_entries_to_user(private->size,
@@ -1043,7 +1030,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
return ret;
}
@@ -1068,8 +1055,8 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
"iptable_%s", name);
- if (IS_ERR_OR_NULL(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ if (!t) {
+ ret = -ENOENT;
goto free_newinfo_counters_untrans;
}
@@ -1184,8 +1171,8 @@ do_add_counters(struct net *net, const void __user *user,
return PTR_ERR(paddc);
t = xt_find_table_lock(net, AF_INET, tmp.name);
- if (IS_ERR_OR_NULL(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ if (!t) {
+ ret = -ENOENT;
goto free;
}
@@ -1630,7 +1617,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
xt_compat_lock(AF_INET);
t = xt_find_table_lock(net, AF_INET, get.name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
ret = compat_table_info(private, &info);
@@ -1644,7 +1631,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
xt_compat_unlock(AF_INET);
return ret;
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 4a9e6db9df8d..9b8841316e7b 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -62,7 +62,7 @@ struct clusterip_config {
static const struct file_operations clusterip_proc_fops;
#endif
-static int clusterip_net_id __read_mostly;
+static unsigned int clusterip_net_id __read_mostly;
struct clusterip_net {
struct list_head configs;
@@ -144,6 +144,11 @@ clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
rcu_read_lock_bh();
c = __clusterip_config_find(net, clusterip);
if (c) {
+#ifdef CONFIG_PROC_FS
+ if (!c->pde)
+ c = NULL;
+ else
+#endif
if (unlikely(!atomic_inc_not_zero(&c->refcount)))
c = NULL;
else if (entry)
@@ -166,14 +171,15 @@ clusterip_config_init_nodelist(struct clusterip_config *c,
static struct clusterip_config *
clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
- struct net_device *dev)
+ struct net_device *dev)
{
+ struct net *net = dev_net(dev);
struct clusterip_config *c;
- struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id);
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
c = kzalloc(sizeof(*c), GFP_ATOMIC);
if (!c)
- return NULL;
+ return ERR_PTR(-ENOMEM);
c->dev = dev;
c->clusterip = ip;
@@ -185,6 +191,17 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
atomic_set(&c->refcount, 1);
atomic_set(&c->entries, 1);
+ spin_lock_bh(&cn->lock);
+ if (__clusterip_config_find(net, ip)) {
+ spin_unlock_bh(&cn->lock);
+ kfree(c);
+
+ return ERR_PTR(-EBUSY);
+ }
+
+ list_add_rcu(&c->list, &cn->configs);
+ spin_unlock_bh(&cn->lock);
+
#ifdef CONFIG_PROC_FS
{
char buffer[16];
@@ -195,16 +212,16 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
cn->procdir,
&clusterip_proc_fops, c);
if (!c->pde) {
+ spin_lock_bh(&cn->lock);
+ list_del_rcu(&c->list);
+ spin_unlock_bh(&cn->lock);
kfree(c);
- return NULL;
+
+ return ERR_PTR(-ENOMEM);
}
}
#endif
- spin_lock_bh(&cn->lock);
- list_add_rcu(&c->list, &cn->configs);
- spin_unlock_bh(&cn->lock);
-
return c;
}
@@ -410,16 +427,16 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
config = clusterip_config_init(cipinfo,
e->ip.dst.s_addr, dev);
- if (!config) {
+ if (IS_ERR(config)) {
dev_put(dev);
- return -ENOMEM;
+ return PTR_ERR(config);
}
dev_mc_add(config->dev, config->clustermac);
}
}
cipinfo->config = config;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -444,7 +461,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
clusterip_config_put(cipinfo->config);
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
#ifdef CONFIG_COMPAT
@@ -468,6 +485,7 @@ static struct xt_target clusterip_tg_reg __read_mostly = {
.checkentry = clusterip_tg_check,
.destroy = clusterip_tg_destroy,
.targetsize = sizeof(struct ipt_clusterip_tgt_info),
+ .usersize = offsetof(struct ipt_clusterip_tgt_info, config),
#ifdef CONFIG_COMPAT
.compatsize = sizeof(struct compat_ipt_clusterip_tgt_info),
#endif /* CONFIG_COMPAT */
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index da7f02a0b868..a03e4e7ef5f9 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -41,7 +41,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par)
pr_debug("bad rangesize %u\n", mr->rangesize);
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(par->net, par->family);
}
static unsigned int
@@ -55,7 +55,13 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
range.min_proto = mr->range[0].min;
range.max_proto = mr->range[0].max;
- return nf_nat_masquerade_ipv4(skb, par->hooknum, &range, par->out);
+ return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range,
+ xt_out(par));
+}
+
+static void masquerade_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_target masquerade_tg_reg __read_mostly = {
@@ -66,6 +72,7 @@ static struct xt_target masquerade_tg_reg __read_mostly = {
.table = "nat",
.hooks = 1 << NF_INET_POST_ROUTING,
.checkentry = masquerade_tg_check,
+ .destroy = masquerade_tg_destroy,
.me = THIS_MODULE,
};
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 1d16c0f28df0..8bd0d7b26632 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -34,7 +34,7 @@ static unsigned int
reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ipt_reject_info *reject = par->targinfo;
- int hook = par->hooknum;
+ int hook = xt_hooknum(par);
switch (reject->with) {
case IPT_ICMP_NET_UNREACHABLE:
@@ -59,7 +59,7 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
break;
case IPT_TCP_RESET:
- nf_send_reset(par->net, skb, hook);
+ nf_send_reset(xt_net(par), skb, hook);
case IPT_ICMP_ECHOREPLY:
/* Doesn't happen. */
break;
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index db5b87509446..3240a2614e82 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -57,8 +57,7 @@ synproxy_send_tcp(struct net *net,
goto free_nskb;
if (nfct) {
- nskb->nfct = nfct;
- nskb->nfctinfo = ctinfo;
+ nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
nf_conntrack_get(nfct);
}
@@ -107,8 +106,8 @@ synproxy_send_client_synack(struct net *net,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
- niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
}
static void
@@ -230,8 +229,8 @@ synproxy_send_client_ack(struct net *net,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
- niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
}
static bool
@@ -263,12 +262,12 @@ static unsigned int
synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_synproxy_info *info = par->targinfo;
- struct net *net = par->net;
+ struct net *net = xt_net(par);
struct synproxy_net *snet = synproxy_pernet(net);
struct synproxy_options opts = {};
struct tcphdr *th, _th;
- if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP))
+ if (nf_ip_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP))
return NF_DROP;
th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
@@ -418,12 +417,12 @@ static int synproxy_tg4_check(const struct xt_tgchk_param *par)
e->ip.invflags & XT_INV_PROTO)
return -EINVAL;
- return nf_ct_l3proto_try_module_get(par->family);
+ return nf_ct_netns_get(par->net, par->family);
}
static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_target synproxy_tg4_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 78cc64eddfc1..37fb9552e858 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -63,10 +63,10 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
return dev_match || flags & XT_RPFILTER_LOOSE;
}
-static bool rpfilter_is_local(const struct sk_buff *skb)
+static bool
+rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in)
{
- const struct rtable *rt = skb_rtable(skb);
- return rt && (rt->rt_flags & RTCF_LOCAL);
+ return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK;
}
static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
@@ -79,14 +79,16 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
info = par->matchinfo;
invert = info->flags & XT_RPFILTER_INVERT;
- if (rpfilter_is_local(skb))
+ if (rpfilter_is_loopback(skb, xt_in(par)))
return true ^ invert;
iph = ip_hdr(skb);
- if (ipv4_is_multicast(iph->daddr)) {
- if (ipv4_is_zeronet(iph->saddr))
- return ipv4_is_local_multicast(iph->daddr) ^ invert;
+ if (ipv4_is_zeronet(iph->saddr)) {
+ if (ipv4_is_lbcast(iph->daddr) ||
+ ipv4_is_local_multicast(iph->daddr))
+ return true ^ invert;
}
+
flow.flowi4_iif = LOOPBACK_IFINDEX;
flow.daddr = iph->saddr;
flow.saddr = rpfilter_get_saddr(iph->daddr);
@@ -95,7 +97,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
flow.flowi4_tos = RT_TOS(iph->tos);
flow.flowi4_scope = RT_SCOPE_UNIVERSE;
- return rpfilter_lookup_reverse(par->net, &flow, par->in, info->flags) ^ invert;
+ return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par), info->flags) ^ invert;
}
static int rpfilter_check(const struct xt_mtchk_param *par)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 713c09a74b90..2e14ed11a35c 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -31,6 +31,13 @@
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
#include <net/netfilter/nf_log.h>
+static int conntrack4_net_id __read_mostly;
+static DEFINE_MUTEX(register_ipv4_hooks);
+
+struct conntrack4_net {
+ unsigned int users;
+};
+
static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
struct nf_conntrack_tuple *tuple)
{
@@ -158,6 +165,10 @@ static unsigned int ipv4_conntrack_local(void *priv,
if (skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
+
+ if (ip_is_fragment(ip_hdr(skb))) /* IP_NODEFRAG setsockopt set */
+ return NF_ACCEPT;
+
return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
}
@@ -228,7 +239,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
}
if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
- pr_debug("SO_ORIGINAL_DST: len %d not %Zu\n",
+ pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
*len, sizeof(struct sockaddr_in));
return -EINVAL;
}
@@ -307,9 +318,42 @@ static struct nf_sockopt_ops so_getorigdst = {
.owner = THIS_MODULE,
};
-static int ipv4_init_net(struct net *net)
+static int ipv4_hooks_register(struct net *net)
{
- return 0;
+ struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
+ int err = 0;
+
+ mutex_lock(&register_ipv4_hooks);
+
+ cnet->users++;
+ if (cnet->users > 1)
+ goto out_unlock;
+
+ err = nf_defrag_ipv4_enable(net);
+ if (err) {
+ cnet->users = 0;
+ goto out_unlock;
+ }
+
+ err = nf_register_net_hooks(net, ipv4_conntrack_ops,
+ ARRAY_SIZE(ipv4_conntrack_ops));
+
+ if (err)
+ cnet->users = 0;
+ out_unlock:
+ mutex_unlock(&register_ipv4_hooks);
+ return err;
+}
+
+static void ipv4_hooks_unregister(struct net *net)
+{
+ struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
+
+ mutex_lock(&register_ipv4_hooks);
+ if (cnet->users && (--cnet->users == 0))
+ nf_unregister_net_hooks(net, ipv4_conntrack_ops,
+ ARRAY_SIZE(ipv4_conntrack_ops));
+ mutex_unlock(&register_ipv4_hooks);
}
struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
@@ -325,7 +369,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
.nlattr_to_tuple = ipv4_nlattr_to_tuple,
.nla_policy = ipv4_nla_policy,
#endif
- .init_net = ipv4_init_net,
+ .net_ns_get = ipv4_hooks_register,
+ .net_ns_put = ipv4_hooks_unregister,
.me = THIS_MODULE,
};
@@ -336,52 +381,50 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
MODULE_ALIAS("ip_conntrack");
MODULE_LICENSE("GPL");
+static struct nf_conntrack_l4proto *builtin_l4proto4[] = {
+ &nf_conntrack_l4proto_tcp4,
+ &nf_conntrack_l4proto_udp4,
+ &nf_conntrack_l4proto_icmp,
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+ &nf_conntrack_l4proto_dccp4,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ &nf_conntrack_l4proto_sctp4,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+ &nf_conntrack_l4proto_udplite4,
+#endif
+};
+
static int ipv4_net_init(struct net *net)
{
int ret = 0;
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp4);
- if (ret < 0) {
- pr_err("nf_conntrack_tcp4: pernet registration failed\n");
- goto out_tcp;
- }
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp4);
- if (ret < 0) {
- pr_err("nf_conntrack_udp4: pernet registration failed\n");
- goto out_udp;
- }
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmp);
- if (ret < 0) {
- pr_err("nf_conntrack_icmp4: pernet registration failed\n");
- goto out_icmp;
- }
+ ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
+ if (ret < 0)
+ return ret;
ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4);
if (ret < 0) {
pr_err("nf_conntrack_ipv4: pernet registration failed\n");
- goto out_ipv4;
+ nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
}
- return 0;
-out_ipv4:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
-out_icmp:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
-out_udp:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
-out_tcp:
return ret;
}
static void ipv4_net_exit(struct net *net)
{
nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
+ nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
}
static struct pernet_operations ipv4_net_ops = {
.init = ipv4_net_init,
.exit = ipv4_net_exit,
+ .id = &conntrack4_net_id,
+ .size = sizeof(struct conntrack4_net),
};
static int __init nf_conntrack_l3proto_ipv4_init(void)
@@ -389,7 +432,6 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
int ret = 0;
need_conntrack();
- nf_defrag_ipv4_enable();
ret = nf_register_sockopt(&so_getorigdst);
if (ret < 0) {
@@ -403,46 +445,21 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
goto cleanup_sockopt;
}
- ret = nf_register_hooks(ipv4_conntrack_ops,
- ARRAY_SIZE(ipv4_conntrack_ops));
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register hooks.\n");
+ ret = nf_ct_l4proto_register(builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
+ if (ret < 0)
goto cleanup_pernet;
- }
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp4);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register tcp4 proto.\n");
- goto cleanup_hooks;
- }
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp4);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register udp4 proto.\n");
- goto cleanup_tcp4;
- }
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmp);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register icmpv4 proto.\n");
- goto cleanup_udp4;
- }
ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
if (ret < 0) {
pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n");
- goto cleanup_icmpv4;
+ goto cleanup_l4proto;
}
return ret;
- cleanup_icmpv4:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
- cleanup_udp4:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
- cleanup_tcp4:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
- cleanup_hooks:
- nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
+cleanup_l4proto:
+ nf_ct_l4proto_unregister(builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
cleanup_pernet:
unregister_pernet_subsys(&ipv4_net_ops);
cleanup_sockopt:
@@ -454,10 +471,8 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
{
synchronize_net();
nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
- nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
+ nf_ct_l4proto_unregister(builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
unregister_pernet_subsys(&ipv4_net_ops);
nf_unregister_sockopt(&so_getorigdst);
}
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index d075b3cf2400..73c591d8a9a8 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -128,16 +128,16 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
static int
icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
- enum ip_conntrack_info *ctinfo,
unsigned int hooknum)
{
struct nf_conntrack_tuple innertuple, origtuple;
const struct nf_conntrack_l4proto *innerproto;
const struct nf_conntrack_tuple_hash *h;
const struct nf_conntrack_zone *zone;
+ enum ip_conntrack_info ctinfo;
struct nf_conntrack_zone tmp;
- NF_CT_ASSERT(skb->nfct == NULL);
+ NF_CT_ASSERT(!skb_nfct(skb));
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
/* Are they talking about one of our connections? */
@@ -160,7 +160,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
return -NF_ACCEPT;
}
- *ctinfo = IP_CT_RELATED;
+ ctinfo = IP_CT_RELATED;
h = nf_conntrack_find_get(net, zone, &innertuple);
if (!h) {
@@ -169,11 +169,10 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
}
if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
- *ctinfo += IP_CT_IS_REPLY;
+ ctinfo += IP_CT_IS_REPLY;
/* Update skb to refer to this connection */
- skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
- skb->nfctinfo = *ctinfo;
+ nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
return NF_ACCEPT;
}
@@ -181,7 +180,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
static int
icmp_error(struct net *net, struct nf_conn *tmpl,
struct sk_buff *skb, unsigned int dataoff,
- enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
+ u8 pf, unsigned int hooknum)
{
const struct icmphdr *icmph;
struct icmphdr _ih;
@@ -225,7 +224,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
icmph->type != ICMP_REDIRECT)
return NF_ACCEPT;
- return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
+ return icmp_error_message(net, tmpl, skb, hooknum);
}
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index d88da36b383c..346bf7ccac08 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -11,6 +11,7 @@
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/skbuff.h>
+#include <net/netns/generic.h>
#include <net/route.h>
#include <net/ip.h>
@@ -22,6 +23,8 @@
#endif
#include <net/netfilter/nf_conntrack_zones.h>
+static DEFINE_MUTEX(defrag4_mutex);
+
static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
u_int32_t user)
{
@@ -42,7 +45,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
{
u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- if (skb->nfct) {
+ if (skb_nfct(skb)) {
enum ip_conntrack_info ctinfo;
const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
@@ -72,7 +75,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
#if !IS_ENABLED(CONFIG_NF_NAT)
/* Previously seen (loopback)? Ignore. Do this before
fragment check. */
- if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+ if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
return NF_ACCEPT;
#endif
#endif
@@ -102,18 +105,50 @@ static struct nf_hook_ops ipv4_defrag_ops[] = {
},
};
+static void __net_exit defrag4_net_exit(struct net *net)
+{
+ if (net->nf.defrag_ipv4) {
+ nf_unregister_net_hooks(net, ipv4_defrag_ops,
+ ARRAY_SIZE(ipv4_defrag_ops));
+ net->nf.defrag_ipv4 = false;
+ }
+}
+
+static struct pernet_operations defrag4_net_ops = {
+ .exit = defrag4_net_exit,
+};
+
static int __init nf_defrag_init(void)
{
- return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
+ return register_pernet_subsys(&defrag4_net_ops);
}
static void __exit nf_defrag_fini(void)
{
- nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
+ unregister_pernet_subsys(&defrag4_net_ops);
}
-void nf_defrag_ipv4_enable(void)
+int nf_defrag_ipv4_enable(struct net *net)
{
+ int err = 0;
+
+ might_sleep();
+
+ if (net->nf.defrag_ipv4)
+ return 0;
+
+ mutex_lock(&defrag4_mutex);
+ if (net->nf.defrag_ipv4)
+ goto out_unlock;
+
+ err = nf_register_net_hooks(net, ipv4_defrag_ops,
+ ARRAY_SIZE(ipv4_defrag_ops));
+ if (err == 0)
+ net->nf.defrag_ipv4 = true;
+
+ out_unlock:
+ mutex_unlock(&defrag4_mutex);
+ return err;
}
EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable);
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index cf986e1c7bbd..f0dbff05fc28 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -68,10 +68,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* Avoid counting cloned packets towards the original connection. */
- nf_conntrack_put(skb->nfct);
- skb->nfct = &nf_ct_untracked_get()->ct_general;
- skb->nfctinfo = IP_CT_NEW;
- nf_conntrack_get(skb->nfct);
+ nf_reset(skb);
+ nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
+ nf_conntrack_get(skb_nfct(skb));
#endif
/*
* If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index b24795e2ee6d..2f3895ddc275 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -69,7 +69,7 @@ static void dump_arp_packet(struct nf_log_buf *m,
ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp);
if (ap == NULL) {
- nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]",
+ nf_log_buf_add(m, " INCOMPLETE [%zu bytes]",
skb->len - sizeof(_arph));
return;
}
@@ -87,7 +87,7 @@ static void nf_log_arp_packet(struct net *net, u_int8_t pf,
struct nf_log_buf *m;
/* FIXME: Disabled from containers until syslog ns is supported */
- if (!net_eq(net, &init_net))
+ if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
return;
m = nf_log_buf_open();
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 856648966f4c..c83a9963269b 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -319,7 +319,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
struct nf_log_buf *m;
/* FIXME: Disabled from containers until syslog ns is supported */
- if (!net_eq(net, &init_net))
+ if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
return;
m = nf_log_buf_open();
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index f8aad03d674b..6f5e8d01b876 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -255,11 +255,6 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
/* maniptype == SRC for postrouting. */
enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
- /* We never see fragments: conntrack defrags on pre-routing
- * and local-out, and nf_nat_out protects post-routing.
- */
- NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
-
ct = nf_ct_get(skb, &ctinfo);
/* Can't track? It's not due to stress, or conntrack would
* have dropped it. Hence it's the user's responsibilty to
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index c9b52c361da2..53e49f5011d3 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -1260,16 +1260,6 @@ static const struct nf_conntrack_expect_policy snmp_exp_policy = {
.timeout = 180,
};
-static struct nf_conntrack_helper snmp_helper __read_mostly = {
- .me = THIS_MODULE,
- .help = help,
- .expect_policy = &snmp_exp_policy,
- .name = "snmp",
- .tuple.src.l3num = AF_INET,
- .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT),
- .tuple.dst.protonum = IPPROTO_UDP,
-};
-
static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
.me = THIS_MODULE,
.help = help,
@@ -1288,22 +1278,16 @@ static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
static int __init nf_nat_snmp_basic_init(void)
{
- int ret = 0;
-
BUG_ON(nf_nat_snmp_hook != NULL);
RCU_INIT_POINTER(nf_nat_snmp_hook, help);
- ret = nf_conntrack_helper_register(&snmp_trap_helper);
- if (ret < 0) {
- nf_conntrack_helper_unregister(&snmp_helper);
- return ret;
- }
- return ret;
+ return nf_conntrack_helper_register(&snmp_trap_helper);
}
static void __exit nf_nat_snmp_basic_fini(void)
{
RCU_INIT_POINTER(nf_nat_snmp_hook, NULL);
+ synchronize_rcu();
nf_conntrack_helper_unregister(&snmp_trap_helper);
}
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index fd8220213afc..146d86105183 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -126,6 +126,8 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
/* ip_route_me_harder expects skb->dst to be set */
skb_dst_set_noref(nskb, skb_dst(oldskb));
+ nskb->mark = IP4_REPLY_MARK(net, oldskb->mark);
+
skb_reserve(nskb, LL_MAX_HEADER);
niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
ip4_dst_hoplimit(skb_dst(nskb)));
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
new file mode 100644
index 000000000000..a83d558e1aae
--- /dev/null
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) 2007-2008 BalaBit IT Ltd.
+ * Author: Krisztian Kovacs
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <net/netfilter/nf_socket.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+static int
+extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol,
+ __be32 *raddr, __be32 *laddr,
+ __be16 *rport, __be16 *lport)
+{
+ unsigned int outside_hdrlen = ip_hdrlen(skb);
+ struct iphdr *inside_iph, _inside_iph;
+ struct icmphdr *icmph, _icmph;
+ __be16 *ports, _ports[2];
+
+ icmph = skb_header_pointer(skb, outside_hdrlen,
+ sizeof(_icmph), &_icmph);
+ if (icmph == NULL)
+ return 1;
+
+ switch (icmph->type) {
+ case ICMP_DEST_UNREACH:
+ case ICMP_SOURCE_QUENCH:
+ case ICMP_REDIRECT:
+ case ICMP_TIME_EXCEEDED:
+ case ICMP_PARAMETERPROB:
+ break;
+ default:
+ return 1;
+ }
+
+ inside_iph = skb_header_pointer(skb, outside_hdrlen +
+ sizeof(struct icmphdr),
+ sizeof(_inside_iph), &_inside_iph);
+ if (inside_iph == NULL)
+ return 1;
+
+ if (inside_iph->protocol != IPPROTO_TCP &&
+ inside_iph->protocol != IPPROTO_UDP)
+ return 1;
+
+ ports = skb_header_pointer(skb, outside_hdrlen +
+ sizeof(struct icmphdr) +
+ (inside_iph->ihl << 2),
+ sizeof(_ports), &_ports);
+ if (ports == NULL)
+ return 1;
+
+ /* the inside IP packet is the one quoted from our side, thus
+ * its saddr is the local address */
+ *protocol = inside_iph->protocol;
+ *laddr = inside_iph->saddr;
+ *lport = ports[0];
+ *raddr = inside_iph->daddr;
+ *rport = ports[1];
+
+ return 0;
+}
+
+static struct sock *
+nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
+ const u8 protocol,
+ const __be32 saddr, const __be32 daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in)
+{
+ switch (protocol) {
+ case IPPROTO_TCP:
+ return inet_lookup(net, &tcp_hashinfo, skb, doff,
+ saddr, sport, daddr, dport,
+ in->ifindex);
+ case IPPROTO_UDP:
+ return udp4_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ }
+ return NULL;
+}
+
+struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb,
+ const struct net_device *indev)
+{
+ __be32 uninitialized_var(daddr), uninitialized_var(saddr);
+ __be16 uninitialized_var(dport), uninitialized_var(sport);
+ const struct iphdr *iph = ip_hdr(skb);
+ struct sk_buff *data_skb = NULL;
+ u8 uninitialized_var(protocol);
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn const *ct;
+#endif
+ int doff = 0;
+
+ if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) {
+ struct udphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, ip_hdrlen(skb),
+ sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return NULL;
+
+ protocol = iph->protocol;
+ saddr = iph->saddr;
+ sport = hp->source;
+ daddr = iph->daddr;
+ dport = hp->dest;
+ data_skb = (struct sk_buff *)skb;
+ doff = iph->protocol == IPPROTO_TCP ?
+ ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) :
+ ip_hdrlen(skb) + sizeof(*hp);
+
+ } else if (iph->protocol == IPPROTO_ICMP) {
+ if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
+ &sport, &dport))
+ return NULL;
+ } else {
+ return NULL;
+ }
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ /* Do the lookup with the original socket address in
+ * case this is a reply packet of an established
+ * SNAT-ted connection.
+ */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct && !nf_ct_is_untracked(ct) &&
+ ((iph->protocol != IPPROTO_ICMP &&
+ ctinfo == IP_CT_ESTABLISHED_REPLY) ||
+ (iph->protocol == IPPROTO_ICMP &&
+ ctinfo == IP_CT_RELATED_REPLY)) &&
+ (ct->status & IPS_SRC_NAT_DONE)) {
+
+ daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
+ dport = (iph->protocol == IPPROTO_TCP) ?
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port :
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
+ }
+#endif
+
+ return nf_socket_get_sock_v4(net, data_skb, doff, protocol, saddr,
+ daddr, sport, dport, indev);
+}
+EXPORT_SYMBOL_GPL(nf_sk_lookup_slow_v4);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
+MODULE_DESCRIPTION("Netfilter IPv4 socket lookup infrastructure");
diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c
index 0c01a270bf9f..0af3d8df70dd 100644
--- a/net/ipv4/netfilter/nft_dup_ipv4.c
+++ b/net/ipv4/netfilter/nft_dup_ipv4.c
@@ -30,7 +30,7 @@ static void nft_dup_ipv4_eval(const struct nft_expr *expr,
};
int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1;
- nf_dup_ipv4(pkt->net, pkt->skb, pkt->hook, &gw, oif);
+ nf_dup_ipv4(nft_net(pkt), pkt->skb, nft_hook(pkt), &gw, oif);
}
static int nft_dup_ipv4_init(const struct nft_ctx *ctx,
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
new file mode 100644
index 000000000000..2981291910dd
--- /dev/null
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -0,0 +1,236 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_fib.h>
+
+#include <net/ip_fib.h>
+#include <net/route.h>
+
+/* don't try to find route from mcast/bcast/zeronet */
+static __be32 get_saddr(__be32 addr)
+{
+ if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
+ ipv4_is_zeronet(addr))
+ return 0;
+ return addr;
+}
+
+#define DSCP_BITS 0xfc
+
+void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ u32 *dst = &regs->data[priv->dreg];
+ const struct net_device *dev = NULL;
+ const struct iphdr *iph;
+ __be32 addr;
+
+ if (priv->flags & NFTA_FIB_F_IIF)
+ dev = nft_in(pkt);
+ else if (priv->flags & NFTA_FIB_F_OIF)
+ dev = nft_out(pkt);
+
+ iph = ip_hdr(pkt->skb);
+ if (priv->flags & NFTA_FIB_F_DADDR)
+ addr = iph->daddr;
+ else
+ addr = iph->saddr;
+
+ *dst = inet_dev_addr_type(nft_net(pkt), dev, addr);
+}
+EXPORT_SYMBOL_GPL(nft_fib4_eval_type);
+
+static int get_ifindex(const struct net_device *dev)
+{
+ return dev ? dev->ifindex : 0;
+}
+
+void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ const struct iphdr *iph;
+ struct fib_result res;
+ struct flowi4 fl4 = {
+ .flowi4_scope = RT_SCOPE_UNIVERSE,
+ .flowi4_iif = LOOPBACK_IFINDEX,
+ };
+ const struct net_device *oif;
+ struct net_device *found;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ int i;
+#endif
+
+ /*
+ * Do not set flowi4_oif, it restricts results (for example, asking
+ * for oif 3 will get RTN_UNICAST result even if the daddr exits
+ * on another interface.
+ *
+ * Search results for the desired outinterface instead.
+ */
+ if (priv->flags & NFTA_FIB_F_OIF)
+ oif = nft_out(pkt);
+ else if (priv->flags & NFTA_FIB_F_IIF)
+ oif = nft_in(pkt);
+ else
+ oif = NULL;
+
+ if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
+ nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
+ nft_fib_store_result(dest, priv->result, pkt,
+ nft_in(pkt)->ifindex);
+ return;
+ }
+
+ iph = ip_hdr(pkt->skb);
+ if (ipv4_is_zeronet(iph->saddr)) {
+ if (ipv4_is_lbcast(iph->daddr) ||
+ ipv4_is_local_multicast(iph->daddr)) {
+ nft_fib_store_result(dest, priv->result, pkt,
+ get_ifindex(pkt->skb->dev));
+ return;
+ }
+ }
+
+ if (priv->flags & NFTA_FIB_F_MARK)
+ fl4.flowi4_mark = pkt->skb->mark;
+
+ fl4.flowi4_tos = iph->tos & DSCP_BITS;
+
+ if (priv->flags & NFTA_FIB_F_DADDR) {
+ fl4.daddr = iph->daddr;
+ fl4.saddr = get_saddr(iph->saddr);
+ } else {
+ fl4.daddr = iph->saddr;
+ fl4.saddr = get_saddr(iph->daddr);
+ }
+
+ *dest = 0;
+
+ if (fib_lookup(nft_net(pkt), &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
+ return;
+
+ switch (res.type) {
+ case RTN_UNICAST:
+ break;
+ case RTN_LOCAL: /* Should not see RTN_LOCAL here */
+ return;
+ default:
+ break;
+ }
+
+ if (!oif) {
+ found = FIB_RES_DEV(res);
+ goto ok;
+ }
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ for (i = 0; i < res.fi->fib_nhs; i++) {
+ struct fib_nh *nh = &res.fi->fib_nh[i];
+
+ if (nh->nh_dev == oif) {
+ found = nh->nh_dev;
+ goto ok;
+ }
+ }
+ return;
+#else
+ found = FIB_RES_DEV(res);
+ if (found != oif)
+ return;
+#endif
+ok:
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF:
+ *dest = found->ifindex;
+ break;
+ case NFT_FIB_RESULT_OIFNAME:
+ strncpy((char *)dest, found->name, IFNAMSIZ);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(nft_fib4_eval);
+
+static struct nft_expr_type nft_fib4_type;
+
+static const struct nft_expr_ops nft_fib4_type_ops = {
+ .type = &nft_fib4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+ .eval = nft_fib4_eval_type,
+ .init = nft_fib_init,
+ .dump = nft_fib_dump,
+ .validate = nft_fib_validate,
+};
+
+static const struct nft_expr_ops nft_fib4_ops = {
+ .type = &nft_fib4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+ .eval = nft_fib4_eval,
+ .init = nft_fib_init,
+ .dump = nft_fib_dump,
+ .validate = nft_fib_validate,
+};
+
+static const struct nft_expr_ops *
+nft_fib4_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ enum nft_fib_result result;
+
+ if (!tb[NFTA_FIB_RESULT])
+ return ERR_PTR(-EINVAL);
+
+ result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
+
+ switch (result) {
+ case NFT_FIB_RESULT_OIF:
+ return &nft_fib4_ops;
+ case NFT_FIB_RESULT_OIFNAME:
+ return &nft_fib4_ops;
+ case NFT_FIB_RESULT_ADDRTYPE:
+ return &nft_fib4_type_ops;
+ default:
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+}
+
+static struct nft_expr_type nft_fib4_type __read_mostly = {
+ .name = "fib",
+ .select_ops = &nft_fib4_select_ops,
+ .policy = nft_fib_policy,
+ .maxattr = NFTA_FIB_MAX,
+ .family = NFPROTO_IPV4,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_fib4_module_init(void)
+{
+ return nft_register_expr(&nft_fib4_type);
+}
+
+static void __exit nft_fib4_module_exit(void)
+{
+ nft_unregister_expr(&nft_fib4_type);
+}
+
+module_init(nft_fib4_module_init);
+module_exit(nft_fib4_module_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_ALIAS_NFT_AF_EXPR(2, "fib");
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index 51ced81b616c..f18677277119 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -26,13 +26,19 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
memset(&range, 0, sizeof(range));
range.flags = priv->flags;
if (priv->sreg_proto_min) {
- range.min_proto.all =
- *(__be16 *)&regs->data[priv->sreg_proto_min];
- range.max_proto.all =
- *(__be16 *)&regs->data[priv->sreg_proto_max];
+ range.min_proto.all = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg_proto_min]);
+ range.max_proto.all = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg_proto_max]);
}
- regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook,
- &range, pkt->out);
+ regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt),
+ &range, nft_out(pkt));
+}
+
+static void
+nft_masq_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
}
static struct nft_expr_type nft_masq_ipv4_type;
@@ -41,6 +47,7 @@ static const struct nft_expr_ops nft_masq_ipv4_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
.eval = nft_masq_ipv4_eval,
.init = nft_masq_init,
+ .destroy = nft_masq_ipv4_destroy,
.dump = nft_masq_dump,
.validate = nft_masq_validate,
};
@@ -77,5 +84,5 @@ module_init(nft_masq_ipv4_module_init);
module_exit(nft_masq_ipv4_module_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq");
diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c
index c09d4381427e..5120be1d3118 100644
--- a/net/ipv4/netfilter/nft_redir_ipv4.c
+++ b/net/ipv4/netfilter/nft_redir_ipv4.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -26,17 +26,22 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr,
memset(&mr, 0, sizeof(mr));
if (priv->sreg_proto_min) {
- mr.range[0].min.all =
- *(__be16 *)&regs->data[priv->sreg_proto_min];
- mr.range[0].max.all =
- *(__be16 *)&regs->data[priv->sreg_proto_max];
+ mr.range[0].min.all = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg_proto_min]);
+ mr.range[0].max.all = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg_proto_max]);
mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
}
mr.range[0].flags |= priv->flags;
- regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr,
- pkt->hook);
+ regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt));
+}
+
+static void
+nft_redir_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
}
static struct nft_expr_type nft_redir_ipv4_type;
@@ -45,6 +50,7 @@ static const struct nft_expr_ops nft_redir_ipv4_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
.eval = nft_redir_ipv4_eval,
.init = nft_redir_init,
+ .destroy = nft_redir_ipv4_destroy,
.dump = nft_redir_dump,
.validate = nft_redir_validate,
};
@@ -72,5 +78,5 @@ module_init(nft_redir_ipv4_module_init);
module_exit(nft_redir_ipv4_module_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir");
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index 2c2553b9026c..517ce93699de 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -27,10 +27,10 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr,
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nf_send_unreach(pkt->skb, priv->icmp_code, pkt->hook);
+ nf_send_unreach(pkt->skb, priv->icmp_code, nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset(pkt->net, pkt->skb, pkt->hook);
+ nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt));
break;
default:
break;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 96b8e2b95731..ccfbce13a633 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -156,17 +156,18 @@ int ping_hash(struct sock *sk)
void ping_unhash(struct sock *sk)
{
struct inet_sock *isk = inet_sk(sk);
+
pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
+ write_lock_bh(&ping_table.lock);
if (sk_hashed(sk)) {
- write_lock_bh(&ping_table.lock);
hlist_nulls_del(&sk->sk_nulls_node);
sk_nulls_node_init(&sk->sk_nulls_node);
sock_put(sk);
isk->inet_num = 0;
isk->inet_sport = 0;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
- write_unlock_bh(&ping_table.lock);
}
+ write_unlock_bh(&ping_table.lock);
}
EXPORT_SYMBOL_GPL(ping_unhash);
@@ -433,9 +434,9 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
goto out;
}
- pr_debug("after bind(): num = %d, dif = %d\n",
- (int)isk->inet_num,
- (int)sk->sk_bound_dev_if);
+ pr_debug("after bind(): num = %hu, dif = %d\n",
+ isk->inet_num,
+ sk->sk_bound_dev_if);
err = 0;
if (sk->sk_family == AF_INET && isk->inet_rcv_saddr)
@@ -609,15 +610,15 @@ int ping_getfrag(void *from, char *to,
fraglen -= sizeof(struct icmphdr);
if (fraglen < 0)
BUG();
- if (csum_and_copy_from_iter(to + sizeof(struct icmphdr),
+ if (!csum_and_copy_from_iter_full(to + sizeof(struct icmphdr),
fraglen, &pfh->wcheck,
- &pfh->msg->msg_iter) != fraglen)
+ &pfh->msg->msg_iter))
return -EFAULT;
} else if (offset < sizeof(struct icmphdr)) {
BUG();
} else {
- if (csum_and_copy_from_iter(to, fraglen, &pfh->wcheck,
- &pfh->msg->msg_iter) != fraglen)
+ if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck,
+ &pfh->msg->msg_iter))
return -EFAULT;
}
@@ -642,6 +643,8 @@ static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
{
struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
+ if (!skb)
+ return 0;
pfh->wcheck = csum_partial((char *)&pfh->icmph,
sizeof(struct icmphdr), pfh->wcheck);
pfh->icmph.checksum = csum_fold(pfh->wcheck);
@@ -793,7 +796,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
- inet_sk_flowi_flags(sk), faddr, saddr, 0, 0);
+ inet_sk_flowi_flags(sk), faddr, saddr, 0, 0,
+ sk->sk_uid);
security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
rt = ip_route_output_flow(net, &fl4, sk);
@@ -847,7 +851,8 @@ out:
return err;
do_confirm:
- dst_confirm(&rt->dst);
+ if (msg->msg_flags & MSG_PROBE)
+ dst_confirm_neigh(&rt->dst, &fl4.daddr);
if (!(msg->msg_flags & MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 7143ca1a6af9..69cf49e8356d 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -57,15 +57,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
unsigned int frag_mem;
int orphans, sockets;
- local_bh_disable();
orphans = percpu_counter_sum_positive(&tcp_orphan_count);
sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
- local_bh_enable();
socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
sock_prot_inuse_get(net, &tcp_prot), orphans,
- atomic_read(&tcp_death_row.tw_count), sockets,
+ atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets,
proto_memory_allocated(&tcp_prot));
seq_printf(seq, "UDP: inuse %d mem %ld\n",
sock_prot_inuse_get(net, &udp_prot),
@@ -264,6 +262,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
+ SNMP_MIB_ITEM("PFMemallocDrop", LINUX_MIB_PFMEMALLOCDROP),
SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ecbe5a7c2d6d..9d943974de2b 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -41,7 +41,7 @@
#include <linux/atomic.h>
#include <asm/byteorder.h>
#include <asm/current.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <linux/stddef.h>
#include <linux/slab.h>
@@ -89,9 +89,10 @@ struct raw_frag_vec {
int hlen;
};
-static struct raw_hashinfo raw_v4_hashinfo = {
+struct raw_hashinfo raw_v4_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
};
+EXPORT_SYMBOL_GPL(raw_v4_hashinfo);
int raw_hash_sk(struct sock *sk)
{
@@ -120,7 +121,7 @@ void raw_unhash_sk(struct sock *sk)
}
EXPORT_SYMBOL_GPL(raw_unhash_sk);
-static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
+struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
unsigned short num, __be32 raddr, __be32 laddr, int dif)
{
sk_for_each_from(sk) {
@@ -136,6 +137,7 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
found:
return sk;
}
+EXPORT_SYMBOL_GPL(__raw_v4_lookup);
/*
* 0 - deliver
@@ -381,6 +383,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
+ if (flags & MSG_CONFIRM)
+ skb_set_dst_pending_confirm(skb, 1);
+
skb->transport_header = skb->network_header;
err = -EFAULT;
if (memcpy_from_msg(iph, msg, length))
@@ -604,7 +609,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk) |
(inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
- daddr, saddr, 0, 0);
+ daddr, saddr, 0, 0, sk->sk_uid);
if (!inet->hdrincl) {
rfv.msg = msg;
@@ -664,7 +669,8 @@ out:
return len;
do_confirm:
- dst_confirm(&rt->dst);
+ if (msg->msg_flags & MSG_PROBE)
+ dst_confirm_neigh(&rt->dst, &fl4.daddr);
if (!(msg->msg_flags & MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
@@ -676,7 +682,9 @@ static void raw_close(struct sock *sk, long timeout)
/*
* Raw sockets may have direct kernel references. Kill them.
*/
+ rtnl_lock();
ip_ra_control(sk, 0, NULL);
+ rtnl_unlock();
sk_common_release(sk);
}
@@ -693,12 +701,20 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+ u32 tb_id = RT_TABLE_LOCAL;
int ret = -EINVAL;
int chk_addr_ret;
if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
goto out;
- chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
+
+ if (sk->sk_bound_dev_if)
+ tb_id = l3mdev_fib_table_by_index(sock_net(sk),
+ sk->sk_bound_dev_if) ? : tb_id;
+
+ chk_addr_ret = inet_addr_type_table(sock_net(sk), addr->sin_addr.s_addr,
+ tb_id);
+
ret = -EADDRNOTAVAIL;
if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
@@ -912,6 +928,20 @@ static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg
}
#endif
+int raw_abort(struct sock *sk, int err)
+{
+ lock_sock(sk);
+
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ __udp_disconnect(sk, 0);
+
+ release_sock(sk);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(raw_abort);
+
struct proto raw_prot = {
.name = "RAW",
.owner = THIS_MODULE,
@@ -937,6 +967,7 @@ struct proto raw_prot = {
.compat_getsockopt = compat_raw_getsockopt,
.compat_ioctl = compat_raw_ioctl,
#endif
+ .diag_destroy = raw_abort,
};
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
new file mode 100644
index 000000000000..e1a51ca68d23
--- /dev/null
+++ b/net/ipv4/raw_diag.c
@@ -0,0 +1,266 @@
+#include <linux/module.h>
+
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+
+#include <net/inet_sock.h>
+#include <net/raw.h>
+#include <net/rawv6.h>
+
+#ifdef pr_fmt
+# undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+static struct raw_hashinfo *
+raw_get_hashinfo(const struct inet_diag_req_v2 *r)
+{
+ if (r->sdiag_family == AF_INET) {
+ return &raw_v4_hashinfo;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (r->sdiag_family == AF_INET6) {
+ return &raw_v6_hashinfo;
+#endif
+ } else {
+ pr_warn_once("Unexpected inet family %d\n",
+ r->sdiag_family);
+ WARN_ON_ONCE(1);
+ return ERR_PTR(-EINVAL);
+ }
+}
+
+/*
+ * Due to requirement of not breaking user API we can't simply
+ * rename @pad field in inet_diag_req_v2 structure, instead
+ * use helper to figure it out.
+ */
+
+static struct sock *raw_lookup(struct net *net, struct sock *from,
+ const struct inet_diag_req_v2 *req)
+{
+ struct inet_diag_req_raw *r = (void *)req;
+ struct sock *sk = NULL;
+
+ if (r->sdiag_family == AF_INET)
+ sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol,
+ r->id.idiag_dst[0],
+ r->id.idiag_src[0],
+ r->id.idiag_if);
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol,
+ (const struct in6_addr *)r->id.idiag_src,
+ (const struct in6_addr *)r->id.idiag_dst,
+ r->id.idiag_if);
+#endif
+ return sk;
+}
+
+static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r)
+{
+ struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
+ struct sock *sk = NULL, *s;
+ int slot;
+
+ if (IS_ERR(hashinfo))
+ return ERR_CAST(hashinfo);
+
+ read_lock(&hashinfo->lock);
+ for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) {
+ sk_for_each(s, &hashinfo->ht[slot]) {
+ sk = raw_lookup(net, s, r);
+ if (sk) {
+ /*
+ * Grab it and keep until we fill
+ * diag meaage to be reported, so
+ * caller should call sock_put then.
+ * We can do that because we're keeping
+ * hashinfo->lock here.
+ */
+ sock_hold(sk);
+ goto out_unlock;
+ }
+ }
+ }
+out_unlock:
+ read_unlock(&hashinfo->lock);
+
+ return sk ? sk : ERR_PTR(-ENOENT);
+}
+
+static int raw_diag_dump_one(struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *r)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sk_buff *rep;
+ struct sock *sk;
+ int err;
+
+ sk = raw_sock_get(net, r);
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
+
+ rep = nlmsg_new(sizeof(struct inet_diag_msg) +
+ sizeof(struct inet_diag_meminfo) + 64,
+ GFP_KERNEL);
+ if (!rep) {
+ sock_put(sk);
+ return -ENOMEM;
+ }
+
+ err = inet_sk_diag_fill(sk, NULL, rep, r,
+ sk_user_ns(NETLINK_CB(in_skb).sk),
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0, nlh,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
+ sock_put(sk);
+
+ if (err < 0) {
+ kfree_skb(rep);
+ return err;
+ }
+
+ err = netlink_unicast(net->diag_nlsk, rep,
+ NETLINK_CB(in_skb).portid,
+ MSG_DONTWAIT);
+ if (err > 0)
+ err = 0;
+ return err;
+}
+
+static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ struct nlattr *bc, bool net_admin)
+{
+ if (!inet_diag_bc_sk(bc, sk))
+ return 0;
+
+ return inet_sk_diag_fill(sk, NULL, skb, r,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ cb->nlh, net_admin);
+}
+
+static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
+ struct net *net = sock_net(skb->sk);
+ int num, s_num, slot, s_slot;
+ struct sock *sk = NULL;
+
+ if (IS_ERR(hashinfo))
+ return;
+
+ s_slot = cb->args[0];
+ num = s_num = cb->args[1];
+
+ read_lock(&hashinfo->lock);
+ for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) {
+ num = 0;
+
+ sk_for_each(sk, &hashinfo->ht[slot]) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next;
+ if (sk->sk_family != r->sdiag_family)
+ goto next;
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next;
+ if (r->id.idiag_dport != inet->inet_dport &&
+ r->id.idiag_dport)
+ goto next;
+ if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0)
+ goto out_unlock;
+next:
+ num++;
+ }
+ }
+
+out_unlock:
+ read_unlock(&hashinfo->lock);
+
+ cb->args[0] = slot;
+ cb->args[1] = num;
+}
+
+static void raw_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *info)
+{
+ r->idiag_rqueue = sk_rmem_alloc_get(sk);
+ r->idiag_wqueue = sk_wmem_alloc_get(sk);
+}
+
+#ifdef CONFIG_INET_DIAG_DESTROY
+static int raw_diag_destroy(struct sk_buff *in_skb,
+ const struct inet_diag_req_v2 *r)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sock *sk;
+ int err;
+
+ sk = raw_sock_get(net, r);
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
+ err = sock_diag_destroy(sk, ECONNABORTED);
+ sock_put(sk);
+ return err;
+}
+#endif
+
+static const struct inet_diag_handler raw_diag_handler = {
+ .dump = raw_diag_dump,
+ .dump_one = raw_diag_dump_one,
+ .idiag_get_info = raw_diag_get_info,
+ .idiag_type = IPPROTO_RAW,
+ .idiag_info_size = 0,
+#ifdef CONFIG_INET_DIAG_DESTROY
+ .destroy = raw_diag_destroy,
+#endif
+};
+
+static void __always_unused __check_inet_diag_req_raw(void)
+{
+ /*
+ * Make sure the two structures are identical,
+ * except the @pad field.
+ */
+#define __offset_mismatch(m1, m2) \
+ (offsetof(struct inet_diag_req_v2, m1) != \
+ offsetof(struct inet_diag_req_raw, m2))
+
+ BUILD_BUG_ON(sizeof(struct inet_diag_req_v2) !=
+ sizeof(struct inet_diag_req_raw));
+ BUILD_BUG_ON(__offset_mismatch(sdiag_family, sdiag_family));
+ BUILD_BUG_ON(__offset_mismatch(sdiag_protocol, sdiag_protocol));
+ BUILD_BUG_ON(__offset_mismatch(idiag_ext, idiag_ext));
+ BUILD_BUG_ON(__offset_mismatch(pad, sdiag_raw_protocol));
+ BUILD_BUG_ON(__offset_mismatch(idiag_states, idiag_states));
+ BUILD_BUG_ON(__offset_mismatch(id, id));
+#undef __offset_mismatch
+}
+
+static int __init raw_diag_init(void)
+{
+ return inet_diag_register(&raw_diag_handler);
+}
+
+static void __exit raw_diag_exit(void)
+{
+ inet_diag_unregister(&raw_diag_handler);
+}
+
+module_init(raw_diag_init);
+module_exit(raw_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2a57566e6e91..acd69cfe2951 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -65,7 +65,7 @@
#define pr_fmt(fmt) "IPv4: " fmt
#include <linux/module.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -154,6 +154,7 @@ static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
struct sk_buff *skb,
const void *daddr);
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
static struct dst_ops ipv4_dst_ops = {
.family = AF_INET,
@@ -168,6 +169,7 @@ static struct dst_ops ipv4_dst_ops = {
.redirect = ip_do_redirect,
.local_out = __ip_local_out,
.neigh_lookup = ipv4_neigh_lookup,
+ .confirm_neigh = ipv4_confirm_neigh,
};
#define ECN_OR_COST(class) TC_PRIO_##class
@@ -461,6 +463,23 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
return neigh_create(&arp_tbl, pkey, dev);
}
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+ struct net_device *dev = dst->dev;
+ const __be32 *pkey = daddr;
+ const struct rtable *rt;
+
+ rt = (const struct rtable *)dst;
+ if (rt->rt_gateway)
+ pkey = (const __be32 *)&rt->rt_gateway;
+ else if (!daddr ||
+ (rt->rt_flags &
+ (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
+ return;
+
+ __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
+}
+
#define IP_IDENTS_SZ 2048u
static atomic_t *ip_idents __read_mostly;
@@ -507,7 +526,8 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
}
EXPORT_SYMBOL(__ip_select_ident);
-static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
+static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
+ const struct sock *sk,
const struct iphdr *iph,
int oif, u8 tos,
u8 prot, u32 mark, int flow_flags)
@@ -523,19 +543,21 @@ static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
flowi4_init_output(fl4, oif, mark, tos,
RT_SCOPE_UNIVERSE, prot,
flow_flags,
- iph->daddr, iph->saddr, 0, 0);
+ iph->daddr, iph->saddr, 0, 0,
+ sock_net_uid(net, sk));
}
static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
const struct sock *sk)
{
+ const struct net *net = dev_net(skb->dev);
const struct iphdr *iph = ip_hdr(skb);
int oif = skb->dev->ifindex;
u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
u32 mark = skb->mark;
- __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
+ __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
@@ -552,7 +574,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk),
- daddr, inet->inet_saddr, 0, 0);
+ daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
rcu_read_unlock();
}
@@ -795,6 +817,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
struct rtable *rt;
struct flowi4 fl4;
const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct net *net = dev_net(skb->dev);
int oif = skb->dev->ifindex;
u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
@@ -802,7 +825,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
rt = (struct rtable *) dst;
- __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
+ __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
__ip_do_redirect(rt, skb, &fl4, true);
}
@@ -1020,7 +1043,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
if (!mark)
mark = IP4_REPLY_MARK(net, skb->mark);
- __build_flow_key(&fl4, NULL, iph, oif,
+ __build_flow_key(net, &fl4, NULL, iph, oif,
RT_TOS(iph->tos), protocol, mark, flow_flags);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
@@ -1036,7 +1059,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
struct flowi4 fl4;
struct rtable *rt;
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+ __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
if (!fl4.flowi4_mark)
fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
@@ -1055,6 +1078,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
struct rtable *rt;
struct dst_entry *odst = NULL;
bool new = false;
+ struct net *net = sock_net(sk);
bh_lock_sock(sk);
@@ -1068,7 +1092,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
goto out;
}
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+ __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
rt = (struct rtable *)odst;
if (odst->obsolete && !odst->ops->check(odst, 0)) {
@@ -1108,7 +1132,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
struct flowi4 fl4;
struct rtable *rt;
- __build_flow_key(&fl4, NULL, iph, oif,
+ __build_flow_key(net, &fl4, NULL, iph, oif,
RT_TOS(iph->tos), protocol, mark, flow_flags);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
@@ -1123,9 +1147,10 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
const struct iphdr *iph = (const struct iphdr *) skb->data;
struct flowi4 fl4;
struct rtable *rt;
+ struct net *net = sock_net(sk);
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
- rt = __ip_route_output_key(sock_net(sk), &fl4);
+ __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
+ rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
__ip_do_redirect(rt, skb, &fl4, false);
ip_rt_put(rt);
@@ -1598,6 +1623,19 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
spin_unlock_bh(&fnhe_lock);
}
+static void set_lwt_redirect(struct rtable *rth)
+{
+ if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
+ rth->dst.lwtstate->orig_output = rth->dst.output;
+ rth->dst.output = lwtunnel_output;
+ }
+
+ if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+ rth->dst.lwtstate->orig_input = rth->dst.input;
+ rth->dst.input = lwtunnel_input;
+ }
+}
+
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
const struct fib_result *res,
@@ -1687,14 +1725,7 @@ rt_cache:
rth->dst.input = ip_forward;
rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
- if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
- rth->dst.lwtstate->orig_output = rth->dst.output;
- rth->dst.output = lwtunnel_output;
- }
- if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
- rth->dst.lwtstate->orig_input = rth->dst.input;
- rth->dst.input = lwtunnel_input;
- }
+ set_lwt_redirect(rth);
skb_dst_set(skb, &rth->dst);
out:
err = 0;
@@ -1746,7 +1777,6 @@ standard_hash:
static int ip_mkroute_input(struct sk_buff *skb,
struct fib_result *res,
- const struct flowi4 *fl4,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
{
@@ -1846,6 +1876,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
fl4.flowi4_flags = 0;
fl4.daddr = daddr;
fl4.saddr = saddr;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
err = fib_lookup(net, &fl4, &res, 0);
if (err != 0) {
if (!IN_DEV_FORWARD(in_dev))
@@ -1871,7 +1902,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (res.type != RTN_UNICAST)
goto martian_destination;
- err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
+ err = ip_mkroute_input(skb, &res, in_dev, daddr, saddr, tos);
out: return err;
brd_input:
@@ -1902,7 +1933,8 @@ local_input:
}
}
- rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
+ rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
+ flags | RTCF_LOCAL, res.type,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
if (!rth)
goto e_nobufs;
@@ -1921,8 +1953,18 @@ local_input:
rth->dst.error= -err;
rth->rt_flags &= ~RTCF_LOCAL;
}
+
if (do_cache) {
- if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
+ struct fib_nh *nh = &FIB_RES_NH(res);
+
+ rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
+ if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+ WARN_ON(rth->dst.input == lwtunnel_input);
+ rth->dst.lwtstate->orig_input = rth->dst.input;
+ rth->dst.input = lwtunnel_input;
+ }
+
+ if (unlikely(!rt_cache_route(nh, rth))) {
rth->dst.flags |= DST_NOCACHE;
rt_add_uncached_list(rth);
}
@@ -1967,6 +2009,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
{
int res;
+ tos &= IPTOS_RT_MASK;
rcu_read_lock();
/* Multicast recognition logic is moved from route cache to here.
@@ -1982,25 +2025,35 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
*/
if (ipv4_is_multicast(daddr)) {
struct in_device *in_dev = __in_dev_get_rcu(dev);
+ int our = 0;
- if (in_dev) {
- int our = ip_check_mc_rcu(in_dev, daddr, saddr,
- ip_hdr(skb)->protocol);
- if (our
+ if (in_dev)
+ our = ip_check_mc_rcu(in_dev, daddr, saddr,
+ ip_hdr(skb)->protocol);
+
+ /* check l3 master if no match yet */
+ if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
+ struct in_device *l3_in_dev;
+
+ l3_in_dev = __in_dev_get_rcu(skb->dev);
+ if (l3_in_dev)
+ our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
+ ip_hdr(skb)->protocol);
+ }
+
+ res = -EINVAL;
+ if (our
#ifdef CONFIG_IP_MROUTE
- ||
- (!ipv4_is_local_multicast(daddr) &&
- IN_DEV_MFORWARD(in_dev))
+ ||
+ (!ipv4_is_local_multicast(daddr) &&
+ IN_DEV_MFORWARD(in_dev))
#endif
- ) {
- int res = ip_route_input_mc(skb, daddr, saddr,
- tos, dev, our);
- rcu_read_unlock();
- return res;
- }
+ ) {
+ res = ip_route_input_mc(skb, daddr, saddr,
+ tos, dev, our);
}
rcu_read_unlock();
- return -EINVAL;
+ return res;
}
res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
rcu_read_unlock();
@@ -2140,8 +2193,7 @@ add:
}
rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
- if (lwtunnel_output_redirect(rth->dst.lwtstate))
- rth->dst.output = lwtunnel_output;
+ set_lwt_redirect(rth);
return rth;
}
@@ -2268,7 +2320,8 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
res.fi = NULL;
res.table = NULL;
if (fl4->flowi4_oif &&
- !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
+ (ipv4_is_multicast(fl4->daddr) ||
+ !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
/* Apparently, routing tables are wrong. Assume,
that the destination is on link.
@@ -2421,7 +2474,7 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow);
static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
- u32 seq, int event, int nowait, unsigned int flags)
+ u32 seq, int event)
{
struct rtable *rt = skb_rtable(skb);
struct rtmsg *r;
@@ -2430,7 +2483,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
u32 error;
u32 metrics[RTAX_MAX];
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), 0);
if (!nlh)
return -EMSGSIZE;
@@ -2439,7 +2492,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
r->rtm_tos = fl4->flowi4_tos;
- r->rtm_table = table_id;
+ r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
if (nla_put_u32(skb, RTA_TABLE, table_id))
goto nla_put_failure;
r->rtm_type = rt->rt_type;
@@ -2495,6 +2548,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
goto nla_put_failure;
+ if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
+ nla_put_u32(skb, RTA_UID,
+ from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
+ goto nla_put_failure;
+
error = rt->dst.error;
if (rt_is_input_route(rt)) {
@@ -2503,18 +2561,12 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
int err = ipmr_get_route(net, skb,
fl4->saddr, fl4->daddr,
- r, nowait, portid);
+ r, portid);
if (err <= 0) {
- if (!nowait) {
- if (err == 0)
- return 0;
- goto nla_put_failure;
- } else {
- if (err == -EMSGSIZE)
- goto nla_put_failure;
- error = err;
- }
+ if (err == 0)
+ return 0;
+ goto nla_put_failure;
}
} else
#endif
@@ -2547,6 +2599,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
int mark;
struct sk_buff *skb;
u32 table_id = RT_TABLE_MAIN;
+ kuid_t uid;
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
if (err < 0)
@@ -2567,13 +2620,17 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
skb_reset_network_header(skb);
/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
- ip_hdr(skb)->protocol = IPPROTO_ICMP;
+ ip_hdr(skb)->protocol = IPPROTO_UDP;
skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
+ if (tb[RTA_UID])
+ uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
+ else
+ uid = (iif ? INVALID_UID : current_uid());
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = dst;
@@ -2581,6 +2638,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
fl4.flowi4_tos = rtm->rtm_tos;
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
+ fl4.flowi4_uid = uid;
if (iif) {
struct net_device *dev;
@@ -2594,9 +2652,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
skb->protocol = htons(ETH_P_IP);
skb->dev = dev;
skb->mark = mark;
- local_bh_disable();
err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
- local_bh_enable();
rt = skb_rtable(skb);
if (err == 0 && rt->dst.error)
@@ -2621,7 +2677,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
- RTM_NEWROUTE, 0, 0);
+ RTM_NEWROUTE);
if (err < 0)
goto errout_free;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index e3c4043c27de..496b97e17aaf 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -13,13 +13,13 @@
#include <linux/tcp.h>
#include <linux/slab.h>
#include <linux/random.h>
-#include <linux/cryptohash.h>
+#include <linux/siphash.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <net/tcp.h>
#include <net/route.h>
-static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
+static siphash_key_t syncookie_secret[2] __read_mostly;
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
@@ -48,24 +48,13 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch);
-
static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 count, int c)
{
- __u32 *tmp;
-
net_get_random_once(syncookie_secret, sizeof(syncookie_secret));
-
- tmp = this_cpu_ptr(ipv4_cookie_scratch);
- memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
- tmp[0] = (__force u32)saddr;
- tmp[1] = (__force u32)daddr;
- tmp[2] = ((__force u32)sport << 16) + (__force u32)dport;
- tmp[3] = count;
- sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
-
- return tmp[17];
+ return siphash_4u32((__force u32)saddr, (__force u32)daddr,
+ (__force u32)sport << 16 | (__force u32)dport,
+ count, &syncookie_secret[c]);
}
@@ -334,6 +323,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
treq = tcp_rsk(req);
treq->rcv_isn = ntohl(th->seq) - 1;
treq->snt_isn = cookie;
+ treq->ts_off = 0;
req->mss = mss;
ireq->ir_num = ntohs(th->dest);
ireq->ir_rmt_port = th->source;
@@ -372,7 +362,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
inet_sk_flowi_flags(sk),
opt->srr ? opt->faddr : ireq->ir_rmt_addr,
- ireq->ir_loc_addr, th->source, th->dest);
+ ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(sock_net(sk), &fl4);
if (IS_ERR(rt)) {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 80bc36b25de2..d6880a6149ee 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -35,6 +35,8 @@ static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
static int tcp_adv_win_scale_min = -31;
static int tcp_adv_win_scale_max = 31;
+static int ip_privileged_port_min;
+static int ip_privileged_port_max = 65535;
static int ip_ttl_min = 1;
static int ip_ttl_max = 255;
static int tcp_syn_retries_min = 1;
@@ -79,7 +81,12 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (write && ret == 0) {
- if (range[1] < range[0])
+ /* Ensure that the upper limit is not smaller than the lower,
+ * and that the lower does not encroach upon the privileged
+ * port limit.
+ */
+ if ((range[1] < range[0]) ||
+ (range[0] < net->ipv4.sysctl_ip_prot_sock))
ret = -EINVAL;
else
set_local_port_range(net, range);
@@ -88,6 +95,40 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
return ret;
}
+/* Validate changes from /proc interface. */
+static int ipv4_privileged_ports(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_ip_prot_sock);
+ int ret;
+ int pports;
+ int range[2];
+ struct ctl_table tmp = {
+ .data = &pports,
+ .maxlen = sizeof(pports),
+ .mode = table->mode,
+ .extra1 = &ip_privileged_port_min,
+ .extra2 = &ip_privileged_port_max,
+ };
+
+ pports = net->ipv4.sysctl_ip_prot_sock;
+
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ inet_get_local_port_range(net, &range[0], &range[1]);
+ /* Ensure that the local port range doesn't overlap with the
+ * privileged port range.
+ */
+ if (range[0] < pports)
+ ret = -EINVAL;
+ else
+ net->ipv4.sysctl_ip_prot_sock = pports;
+ }
+
+ return ret;
+}
static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
{
@@ -290,13 +331,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_max_tw_buckets",
- .data = &tcp_death_row.sysctl_max_tw_buckets,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_fastopen",
.data = &sysctl_tcp_fastopen,
.maxlen = sizeof(int),
@@ -310,13 +344,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_tcp_fastopen_key,
},
{
- .procname = "tcp_tw_recycle",
- .data = &tcp_death_row.sysctl_tw_recycle,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_abort_on_overflow",
.data = &sysctl_tcp_abort_on_overflow,
.maxlen = sizeof(int),
@@ -338,13 +365,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_max_syn_backlog",
- .data = &sysctl_max_syn_backlog,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "inet_peer_threshold",
.data = &inet_peer_threshold,
.maxlen = sizeof(int),
@@ -433,13 +453,6 @@ static struct ctl_table ipv4_table[] = {
.extra2 = &tcp_adv_win_scale_max,
},
{
- .procname = "tcp_tw_reuse",
- .data = &sysctl_tcp_tw_reuse,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_frto",
.data = &sysctl_tcp_frto,
.maxlen = sizeof(int),
@@ -565,13 +578,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_thin_dupack",
- .data = &sysctl_tcp_thin_dupack,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_early_retrans",
.data = &sysctl_tcp_early_retrans,
.maxlen = sizeof(int),
@@ -958,7 +964,35 @@ static struct ctl_table ipv4_net_table[] = {
.data = &init_net.ipv4.sysctl_tcp_notsent_lowat,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_douintvec,
+ },
+ {
+ .procname = "tcp_tw_reuse",
+ .data = &init_net.ipv4.sysctl_tcp_tw_reuse,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_max_tw_buckets",
+ .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_tw_recycle",
+ .data = &init_net.ipv4.tcp_death_row.sysctl_tw_recycle,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_max_syn_backlog",
+ .data = &init_net.ipv4.sysctl_max_syn_backlog,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
},
#ifdef CONFIG_IP_ROUTE_MULTIPATH
{
@@ -971,6 +1005,24 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = &one,
},
#endif
+ {
+ .procname = "ip_unprivileged_port_start",
+ .maxlen = sizeof(int),
+ .data = &init_net.ipv4.sysctl_ip_prot_sock,
+ .mode = 0644,
+ .proc_handler = ipv4_privileged_ports,
+ },
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ {
+ .procname = "udp_l3mdev_accept",
+ .data = &init_net.ipv4.sysctl_udp_l3mdev_accept,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{ }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 814af89c1bd3..40ba4249a586 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -277,9 +277,8 @@
#include <net/ip.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
-#include <asm/unaligned.h>
#include <net/busy_poll.h>
int sysctl_tcp_min_tso_segs __read_mostly = 2;
@@ -405,10 +404,8 @@ void tcp_init_sock(struct sock *sk)
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_clamp = ~0;
tp->mss_cache = TCP_MSS_DEFAULT;
- u64_stats_init(&tp->syncp);
tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
- tcp_enable_early_retrans(tp);
tcp_assign_congestion_control(sk);
tp->tsoffset = 0;
@@ -423,15 +420,13 @@ void tcp_init_sock(struct sock *sk)
sk->sk_sndbuf = sysctl_tcp_wmem[1];
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
- local_bh_disable();
sk_sockets_allocated_inc(sk);
- local_bh_enable();
}
EXPORT_SYMBOL(tcp_init_sock);
static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
{
- if (tsflags) {
+ if (tsflags && skb) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -538,6 +533,12 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (tp->urg_data & TCP_URG_VALID)
mask |= POLLPRI;
+ } else if (sk->sk_state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+ /* Active TCP fastopen socket with defer_connect
+ * Return POLLOUT so application can call write()
+ * in order for kernel to generate SYN+data
+ */
+ mask |= POLLOUT | POLLWRNORM;
}
/* This barrier is coupled with smp_wmb() in tcp_reset() */
smp_rmb();
@@ -665,9 +666,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
if (tcp_should_autocork(sk, skb, size_goal)) {
/* avoid atomic op if TSQ_THROTTLED bit is already set */
- if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
+ if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
- set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
}
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED.
@@ -772,6 +773,12 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
ret = -EAGAIN;
break;
}
+ /* if __tcp_splice_read() got nothing while we have
+ * an skb in receive queue, we do not want to loop.
+ * This might happen with URG data.
+ */
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ break;
sk_wait_data(sk, &timeo, NULL);
if (signal_pending(current)) {
ret = sock_intr_errno(timeo);
@@ -960,10 +967,8 @@ new_segment:
copied += copy;
offset += copy;
size -= copy;
- if (!size) {
- tcp_tx_timestamp(sk, sk->sk_tsflags, skb);
+ if (!size)
goto out;
- }
if (skb->len < size_goal || (flags & MSG_OOB))
continue;
@@ -989,8 +994,11 @@ wait_for_memory:
}
out:
- if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
- tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+ if (copied) {
+ tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk));
+ if (!(flags & MSG_SENDPAGE_NOTLAST))
+ tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+ }
return copied;
do_error:
@@ -998,8 +1006,11 @@ do_error:
goto out;
out_err:
/* make sure we wake any epoll edge trigger waiter */
- if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
+ err == -EAGAIN)) {
sk->sk_write_space(sk);
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+ }
return sk_stream_error(sk, flags, err);
}
@@ -1072,6 +1083,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
int *copied, size_t size)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
int err, flags;
if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
@@ -1086,11 +1098,26 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
tp->fastopen_req->data = msg;
tp->fastopen_req->size = size;
+ if (inet->defer_connect) {
+ err = tcp_connect(sk);
+ /* Same failure procedure as in tcp_v4/6_connect */
+ if (err) {
+ tcp_set_state(sk, TCP_CLOSE);
+ inet->inet_dport = 0;
+ sk->sk_route_caps = 0;
+ }
+ }
flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
- msg->msg_namelen, flags);
- *copied = tp->fastopen_req->copied;
- tcp_free_fastopen_req(tp);
+ msg->msg_namelen, flags, 1);
+ /* fastopen_req could already be freed in __inet_stream_connect
+ * if the connection times out or gets rst
+ */
+ if (tp->fastopen_req) {
+ *copied = tp->fastopen_req->copied;
+ tcp_free_fastopen_req(tp);
+ inet->defer_connect = 0;
+ }
return err;
}
@@ -1108,7 +1135,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
lock_sock(sk);
flags = msg->msg_flags;
- if (flags & MSG_FASTOPEN) {
+ if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
if (err == -EINPROGRESS && copied_syn > 0)
goto out;
@@ -1266,7 +1293,7 @@ new_segment:
} else {
skb_fill_page_desc(skb, i, pfrag->page,
pfrag->offset, copy);
- get_page(pfrag->page);
+ page_ref_inc(pfrag->page);
}
pfrag->offset += copy;
}
@@ -1280,7 +1307,6 @@ new_segment:
copied += copy;
if (!msg_data_left(msg)) {
- tcp_tx_timestamp(sk, sockc.tsflags, skb);
if (unlikely(flags & MSG_EOR))
TCP_SKB_CB(skb)->eor = 1;
goto out;
@@ -1311,8 +1337,10 @@ wait_for_memory:
}
out:
- if (copied)
+ if (copied) {
+ tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+ }
out_nopush:
release_sock(sk);
return copied + copied_syn;
@@ -1333,8 +1361,11 @@ do_error:
out_err:
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
- if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
+ err == -EAGAIN)) {
sk->sk_write_space(sk);
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+ }
release_sock(sk);
return err;
}
@@ -2291,6 +2322,11 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_init_send_head(sk);
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);
+ tcp_saved_syn_free(tp);
+
+ /* Clean up fastopen related fields */
+ tcp_free_fastopen_req(tp);
+ inet->defer_connect = 0;
WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
@@ -2302,7 +2338,7 @@ EXPORT_SYMBOL(tcp_disconnect);
static inline bool tcp_can_repair_sock(const struct sock *sk)
{
return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
- ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
+ (sk->sk_state != TCP_LISTEN);
}
static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
@@ -2469,11 +2505,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_THIN_DUPACK:
if (val < 0 || val > 1)
err = -EINVAL;
- else {
- tp->thin_dupack = val;
- if (tp->thin_dupack)
- tcp_disable_early_retrans(tp);
- }
break;
case TCP_REPAIR:
@@ -2658,6 +2689,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
err = -EINVAL;
}
break;
+ case TCP_FASTOPEN_CONNECT:
+ if (val > 1 || val < 0) {
+ err = -EINVAL;
+ } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+ if (sk->sk_state == TCP_CLOSE)
+ tp->fastopen_connect = val;
+ else
+ err = -EINVAL;
+ } else {
+ err = -EOPNOTSUPP;
+ }
+ break;
case TCP_TIMESTAMP:
if (!tp->repair)
err = -EPERM;
@@ -2704,15 +2747,33 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif
+static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
+ struct tcp_info *info)
+{
+ u64 stats[__TCP_CHRONO_MAX], total = 0;
+ enum tcp_chrono i;
+
+ for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
+ stats[i] = tp->chrono_stat[i - 1];
+ if (i == tp->chrono_type)
+ stats[i] += tcp_time_stamp - tp->chrono_start;
+ stats[i] *= USEC_PER_SEC / HZ;
+ total += stats[i];
+ }
+
+ info->tcpi_busy_time = total;
+ info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
+ info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
+}
+
/* Return information about state of tcp endpoint in API format. */
void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 now = tcp_time_stamp, intv;
- unsigned int start;
- int notsent_bytes;
+ u32 now, intv;
u64 rate64;
+ bool slow;
u32 rate;
memset(info, 0, sizeof(*info));
@@ -2721,6 +2782,30 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_state = sk_state_load(sk);
+ /* Report meaningful fields for all TCP states, including listeners */
+ rate = READ_ONCE(sk->sk_pacing_rate);
+ rate64 = rate != ~0U ? rate : ~0ULL;
+ info->tcpi_pacing_rate = rate64;
+
+ rate = READ_ONCE(sk->sk_max_pacing_rate);
+ rate64 = rate != ~0U ? rate : ~0ULL;
+ info->tcpi_max_pacing_rate = rate64;
+
+ info->tcpi_reordering = tp->reordering;
+ info->tcpi_snd_cwnd = tp->snd_cwnd;
+
+ if (info->tcpi_state == TCP_LISTEN) {
+ /* listeners aliased fields :
+ * tcpi_unacked -> Number of children ready for accept()
+ * tcpi_sacked -> max backlog
+ */
+ info->tcpi_unacked = sk->sk_ack_backlog;
+ info->tcpi_sacked = sk->sk_max_ack_backlog;
+ return;
+ }
+
+ slow = lock_sock_fast(sk);
+
info->tcpi_ca_state = icsk->icsk_ca_state;
info->tcpi_retransmits = icsk->icsk_retransmits;
info->tcpi_probes = icsk->icsk_probes_out;
@@ -2748,17 +2833,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_snd_mss = tp->mss_cache;
info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
- if (info->tcpi_state == TCP_LISTEN) {
- info->tcpi_unacked = sk->sk_ack_backlog;
- info->tcpi_sacked = sk->sk_max_ack_backlog;
- } else {
- info->tcpi_unacked = tp->packets_out;
- info->tcpi_sacked = tp->sacked_out;
- }
+ info->tcpi_unacked = tp->packets_out;
+ info->tcpi_sacked = tp->sacked_out;
+
info->tcpi_lost = tp->lost_out;
info->tcpi_retrans = tp->retrans_out;
info->tcpi_fackets = tp->fackets_out;
+ now = tcp_time_stamp;
info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
@@ -2768,34 +2850,21 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_rtt = tp->srtt_us >> 3;
info->tcpi_rttvar = tp->mdev_us >> 2;
info->tcpi_snd_ssthresh = tp->snd_ssthresh;
- info->tcpi_snd_cwnd = tp->snd_cwnd;
info->tcpi_advmss = tp->advmss;
- info->tcpi_reordering = tp->reordering;
info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
info->tcpi_rcv_space = tp->rcvq_space.space;
info->tcpi_total_retrans = tp->total_retrans;
- rate = READ_ONCE(sk->sk_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
- put_unaligned(rate64, &info->tcpi_pacing_rate);
+ info->tcpi_bytes_acked = tp->bytes_acked;
+ info->tcpi_bytes_received = tp->bytes_received;
+ info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
+ tcp_get_info_chrono_stats(tp, info);
- rate = READ_ONCE(sk->sk_max_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
- put_unaligned(rate64, &info->tcpi_max_pacing_rate);
-
- do {
- start = u64_stats_fetch_begin_irq(&tp->syncp);
- put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
- put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
- } while (u64_stats_fetch_retry_irq(&tp->syncp, start));
info->tcpi_segs_out = tp->segs_out;
info->tcpi_segs_in = tp->segs_in;
- notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
- info->tcpi_notsent_bytes = max(0, notsent_bytes);
-
info->tcpi_min_rtt = tcp_min_rtt(tp);
info->tcpi_data_segs_in = tp->data_segs_in;
info->tcpi_data_segs_out = tp->data_segs_out;
@@ -2806,11 +2875,36 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
if (rate && intv) {
rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
do_div(rate64, intv);
- put_unaligned(rate64, &info->tcpi_delivery_rate);
+ info->tcpi_delivery_rate = rate64;
}
+ unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
+struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *stats;
+ struct tcp_info info;
+
+ stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
+ if (!stats)
+ return NULL;
+
+ tcp_get_info_chrono_stats(tp, &info);
+ nla_put_u64_64bit(stats, TCP_NLA_BUSY,
+ info.tcpi_busy_time, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
+ info.tcpi_rwnd_limited, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
+ info.tcpi_sndbuf_limited, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
+ tp->data_segs_out, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
+ tp->total_retrans, TCP_NLA_PAD);
+ return stats;
+}
+
static int do_tcp_getsockopt(struct sock *sk, int level,
int optname, char __user *optval, int __user *optlen)
{
@@ -2917,8 +3011,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_THIN_LINEAR_TIMEOUTS:
val = tp->thin_lto;
break;
+
case TCP_THIN_DUPACK:
- val = tp->thin_dupack;
+ val = 0;
break;
case TCP_REPAIR:
@@ -2971,6 +3066,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = icsk->icsk_accept_queue.fastopenq.max_qlen;
break;
+ case TCP_FASTOPEN_CONNECT:
+ val = tp->fastopen_connect;
+ break;
+
case TCP_TIMESTAMP:
val = tcp_time_stamp + tp->tsoffset;
break;
@@ -3284,6 +3383,7 @@ void __init tcp_init(void)
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
+ inet_hashinfo_init(&tcp_hashinfo);
tcp_hashinfo.bind_bucket_cachep =
kmem_cache_create("tcp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,
@@ -3327,10 +3427,7 @@ void __init tcp_init(void)
cnt = tcp_hashinfo.ehash_mask + 1;
-
- tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
sysctl_tcp_max_orphans = cnt / 2;
- sysctl_max_syn_backlog = max(128, cnt / 256);
tcp_init_mem();
/* Set per-socket limits to no more than 1/128 the pressure threshold */
@@ -3349,6 +3446,7 @@ void __init tcp_init(void)
pr_info("Hash tables configured (established %u bind %u)\n",
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
+ tcp_v4_init();
tcp_metrics_init();
BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
tcp_tasklet_init();
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 0ea66c2c9344..b89bce4c721e 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -14,6 +14,36 @@
* observed, or adjust the sending rate if it estimates there is a
* traffic policer, in order to keep the drop rate reasonable.
*
+ * Here is a state transition diagram for BBR:
+ *
+ * |
+ * V
+ * +---> STARTUP ----+
+ * | | |
+ * | V |
+ * | DRAIN ----+
+ * | | |
+ * | V |
+ * +---> PROBE_BW ----+
+ * | ^ | |
+ * | | | |
+ * | +----+ |
+ * | |
+ * +---- PROBE_RTT <--+
+ *
+ * A BBR flow starts in STARTUP, and ramps up its sending rate quickly.
+ * When it estimates the pipe is full, it enters DRAIN to drain the queue.
+ * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT.
+ * A long-lived BBR flow spends the vast majority of its time remaining
+ * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth
+ * in a fair manner, with a small, bounded queue. *If* a flow has been
+ * continuously sending for the entire min_rtt window, and hasn't seen an RTT
+ * sample that matches or decreases its min_rtt estimate for 10 seconds, then
+ * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe
+ * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if
+ * we estimated that we reached the full bw of the pipe then we enter PROBE_BW;
+ * otherwise we enter STARTUP to try to fill the pipe.
+ *
* BBR is described in detail in:
* "BBR: Congestion-Based Congestion Control",
* Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
@@ -51,7 +81,7 @@ enum bbr_mode {
BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */
BBR_DRAIN, /* drain any queue created during startup */
BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */
- BBR_PROBE_RTT, /* cut cwnd to min to probe min_rtt */
+ BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */
};
/* BBR congestion control block */
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 35b280361cb2..50a0f3e51d5b 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -27,6 +27,8 @@
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/module.h>
+#include <linux/sched/clock.h>
+
#include <net/tcp.h>
#define HYSTART_ACK_TRAIN 1
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index f9038d6b109e..79c4817abc94 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -68,8 +68,9 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
int ret = 0;
- /* all algorithms must implement ssthresh and cong_avoid ops */
- if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) {
+ /* all algorithms must implement these */
+ if (!ca->ssthresh || !ca->undo_cwnd ||
+ !(ca->cong_avoid || ca->cong_control)) {
pr_err("%s does not implement required ops\n", ca->name);
return -EINVAL;
}
@@ -443,10 +444,19 @@ u32 tcp_reno_ssthresh(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
+u32 tcp_reno_undo_cwnd(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ return max(tp->snd_cwnd, tp->snd_ssthresh << 1);
+}
+EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);
+
struct tcp_congestion_ops tcp_reno = {
.flags = TCP_CONG_NON_RESTRICTED,
.name = "reno",
.owner = THIS_MODULE,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
+ .undo_cwnd = tcp_reno_undo_cwnd,
};
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index ab37c6775630..5f5e5936760e 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -335,6 +335,7 @@ static struct tcp_congestion_ops dctcp __read_mostly = {
static struct tcp_congestion_ops dctcp_reno __read_mostly = {
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.get_info = dctcp_get_info,
.owner = THIS_MODULE,
.name = "dctcp-reno",
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 4e777a3243f9..8ea4e9787f82 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -113,7 +113,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
struct tcp_fastopen_cookie tmp;
if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) {
- struct in6_addr *buf = (struct in6_addr *) tmp.val;
+ struct in6_addr *buf = &tmp.addr;
int i;
for (i = 0; i < 4; i++)
@@ -205,6 +205,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
* scaled. So correct it appropriately.
*/
tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
+ tp->max_window = tp->snd_wnd;
/* Activate the retrans timer so that SYNACK can be retransmitted.
* The request socket is not added to the ehash
@@ -325,3 +326,57 @@ fastopen:
*foc = valid_foc;
return NULL;
}
+
+bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
+ struct tcp_fastopen_cookie *cookie)
+{
+ unsigned long last_syn_loss = 0;
+ int syn_loss = 0;
+
+ tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
+
+ /* Recurring FO SYN losses: no cookie or data in SYN */
+ if (syn_loss > 1 &&
+ time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+ cookie->len = -1;
+ return false;
+ }
+ if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
+ cookie->len = -1;
+ return true;
+ }
+ return cookie->len > 0;
+}
+
+/* This function checks if we want to defer sending SYN until the first
+ * write(). We defer under the following conditions:
+ * 1. fastopen_connect sockopt is set
+ * 2. we have a valid cookie
+ * Return value: return true if we want to defer until application writes data
+ * return false if we want to send out SYN immediately
+ */
+bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
+{
+ struct tcp_fastopen_cookie cookie = { .len = 0 };
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 mss;
+
+ if (tp->fastopen_connect && !tp->fastopen_req) {
+ if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
+ inet_sk(sk)->defer_connect = 1;
+ return true;
+ }
+
+ /* Alloc fastopen_req in order for FO option to be included
+ * in SYN
+ */
+ tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
+ sk->sk_allocation);
+ if (tp->fastopen_req)
+ tp->fastopen_req->cookie = cookie;
+ else
+ *err = -ENOBUFS;
+ }
+ return false;
+}
+EXPORT_SYMBOL(tcp_fastopen_defer_connect);
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index db7842495a64..6d9879e93648 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -94,6 +94,7 @@ static const struct hstcp_aimd_val {
struct hstcp {
u32 ai;
+ u32 loss_cwnd;
};
static void hstcp_init(struct sock *sk)
@@ -150,16 +151,24 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
static u32 hstcp_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- const struct hstcp *ca = inet_csk_ca(sk);
+ struct hstcp *ca = inet_csk_ca(sk);
+ ca->loss_cwnd = tp->snd_cwnd;
/* Do multiplicative decrease */
return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
}
+static u32 hstcp_cwnd_undo(struct sock *sk)
+{
+ const struct hstcp *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
.init = hstcp_init,
.ssthresh = hstcp_ssthresh,
+ .undo_cwnd = hstcp_cwnd_undo,
.cong_avoid = hstcp_cong_avoid,
.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 083831e359df..0f7175c3338e 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -166,6 +166,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
static struct tcp_congestion_ops tcp_hybla __read_mostly = {
.init = hybla_init,
.ssthresh = tcp_reno_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = hybla_cong_avoid,
.set_state = hybla_state,
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index c8e6d86be114..60352ff4f5a8 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -48,6 +48,7 @@ struct illinois {
u32 end_seq; /* right edge of current RTT */
u32 alpha; /* Additive increase */
u32 beta; /* Muliplicative decrease */
+ u32 loss_cwnd; /* cwnd on loss */
u16 acked; /* # packets acked by current ACK */
u8 rtt_above; /* average rtt has gone above threshold */
u8 rtt_low; /* # of rtts measurements below threshold */
@@ -296,10 +297,18 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct illinois *ca = inet_csk_ca(sk);
+ ca->loss_cwnd = tp->snd_cwnd;
/* Multiplicative decrease */
return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U);
}
+static u32 tcp_illinois_cwnd_undo(struct sock *sk)
+{
+ const struct illinois *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
/* Extract info for Tcp socket info provided via netlink. */
static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
union tcp_cc_info *info)
@@ -327,6 +336,7 @@ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
static struct tcp_congestion_ops tcp_illinois __read_mostly = {
.init = tcp_illinois_init,
.ssthresh = tcp_illinois_ssthresh,
+ .undo_cwnd = tcp_illinois_cwnd_undo,
.cong_avoid = tcp_illinois_cong_avoid,
.set_state = tcp_illinois_state,
.get_info = tcp_illinois_info,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c71d49ce0c93..659d1baefb2b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -79,12 +79,13 @@
int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
-int sysctl_tcp_fack __read_mostly = 1;
+int sysctl_tcp_fack __read_mostly;
int sysctl_tcp_max_reordering __read_mostly = 300;
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
+EXPORT_SYMBOL(sysctl_tcp_timestamps);
/* rfc5961 challenge ack rate limiting */
int sysctl_tcp_challenge_ack_limit = 1000;
@@ -94,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
-
-int sysctl_tcp_thin_dupack __read_mostly;
-
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_early_retrans __read_mostly = 3;
int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
@@ -128,7 +126,8 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define REXMIT_LOST 1 /* retransmit packets marked lost */
#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
-static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb)
+static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
+ unsigned int len)
{
static bool __once __read_mostly;
@@ -139,8 +138,9 @@ static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb)
rcu_read_lock();
dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
- pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
- dev ? dev->name : "Unknown driver");
+ if (!dev || len >= dev->mtu)
+ pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
+ dev ? dev->name : "Unknown driver");
rcu_read_unlock();
}
}
@@ -163,8 +163,10 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
if (len >= icsk->icsk_ack.rcv_mss) {
icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
tcp_sk(sk)->advmss);
- if (unlikely(icsk->icsk_ack.rcv_mss != len))
- tcp_gro_dev_warn(sk, skb);
+ /* Account for possibly-removed options */
+ if (unlikely(len > icsk->icsk_ack.rcv_mss +
+ MAX_TCP_OPTION_SPACE))
+ tcp_gro_dev_warn(sk, skb, len);
} else {
/* Otherwise, we make more careful check taking into account,
* that SACKs block is variable.
@@ -876,22 +878,11 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
const int ts)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (metric > tp->reordering) {
- int mib_idx;
+ int mib_idx;
+ if (metric > tp->reordering) {
tp->reordering = min(sysctl_tcp_max_reordering, metric);
- /* This exciting event is worth to be remembered. 8) */
- if (ts)
- mib_idx = LINUX_MIB_TCPTSREORDER;
- else if (tcp_is_reno(tp))
- mib_idx = LINUX_MIB_TCPRENOREORDER;
- else if (tcp_is_fack(tp))
- mib_idx = LINUX_MIB_TCPFACKREORDER;
- else
- mib_idx = LINUX_MIB_TCPSACKREORDER;
-
- NET_INC_STATS(sock_net(sk), mib_idx);
#if FASTRETRANS_DEBUG > 1
pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
@@ -903,9 +894,19 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
tcp_disable_fack(tp);
}
- if (metric > 0)
- tcp_disable_early_retrans(tp);
tp->rack.reord = 1;
+
+ /* This exciting event is worth to be remembered. 8) */
+ if (ts)
+ mib_idx = LINUX_MIB_TCPTSREORDER;
+ else if (tcp_is_reno(tp))
+ mib_idx = LINUX_MIB_TCPRENOREORDER;
+ else if (tcp_is_fack(tp))
+ mib_idx = LINUX_MIB_TCPFACKREORDER;
+ else
+ mib_idx = LINUX_MIB_TCPSACKREORDER;
+
+ NET_INC_STATS(sock_net(sk), mib_idx);
}
/* This must be called before lost_out is incremented */
@@ -915,10 +916,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
before(TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
tp->retransmit_skb_hint = skb;
-
- if (!tp->lost_out ||
- after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
- tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
}
/* Sum the number of packets on the wire we have marked as lost.
@@ -1134,6 +1131,7 @@ struct tcp_sacktag_state {
*/
struct skb_mstamp first_sackt;
struct skb_mstamp last_sackt;
+ struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */
struct rate_sample *rate;
int flag;
};
@@ -1216,7 +1214,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked;
if (!(sacked & TCPCB_SACKED_ACKED)) {
- tcp_rack_advance(tp, xmit_time, sacked);
+ tcp_rack_advance(tp, sacked, end_seq,
+ xmit_time, &state->ack_time);
if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
@@ -1981,7 +1980,6 @@ void tcp_enter_loss(struct sock *sk)
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
- tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
}
}
tcp_verify_left_out(tp);
@@ -2000,6 +1998,11 @@ void tcp_enter_loss(struct sock *sk)
/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
* loss recovery is underway except recurring timeout(s) on
* the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+ *
+ * In theory F-RTO can be used repeatedly during loss recovery.
+ * In practice this interacts badly with broken middle-boxes that
+ * falsely raise the receive window, which results in repeated
+ * timeouts and stop-and-go behavior.
*/
tp->frto = sysctl_tcp_frto &&
(new_recovery || icsk->icsk_retransmits) &&
@@ -2055,30 +2058,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
}
-static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- unsigned long delay;
-
- /* Delay early retransmit and entering fast recovery for
- * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
- * available, or RTO is scheduled to fire first.
- */
- if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
- (flag & FLAG_ECE) || !tp->srtt_us)
- return false;
-
- delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
- msecs_to_jiffies(2));
-
- if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
- return false;
-
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
- TCP_RTO_MAX);
- return true;
-}
-
/* Linux NewReno/SACK/FACK/ECN state machine.
* --------------------------------------
*
@@ -2126,10 +2105,26 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
* F.e. after RTO, when all the queue is considered as lost,
* lost_out = packets_out and in_flight = retrans_out.
*
- * Essentially, we have now two algorithms counting
+ * Essentially, we have now a few algorithms detecting
* lost packets.
*
- * FACK: It is the simplest heuristics. As soon as we decided
+ * If the receiver supports SACK:
+ *
+ * RFC6675/3517: It is the conventional algorithm. A packet is
+ * considered lost if the number of higher sequence packets
+ * SACKed is greater than or equal the DUPACK thoreshold
+ * (reordering). This is implemented in tcp_mark_head_lost and
+ * tcp_update_scoreboard.
+ *
+ * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
+ * (2017-) that checks timing instead of counting DUPACKs.
+ * Essentially a packet is considered lost if it's not S/ACKed
+ * after RTT + reordering_window, where both metrics are
+ * dynamically measured and adjusted. This is implemented in
+ * tcp_rack_mark_lost.
+ *
+ * FACK (Disabled by default. Subsumbed by RACK):
+ * It is the simplest heuristics. As soon as we decided
* that something is lost, we decide that _all_ not SACKed
* packets until the most forward SACK are lost. I.e.
* lost_out = fackets_out - sacked_out and left_out = fackets_out.
@@ -2138,16 +2133,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
* takes place. We use FACK by default until reordering
* is suspected on the path to this destination.
*
- * NewReno: when Recovery is entered, we assume that one segment
+ * If the receiver does not support SACK:
+ *
+ * NewReno (RFC6582): in Recovery we assume that one segment
* is lost (classic Reno). While we are in Recovery and
* a partial ACK arrives, we assume that one more packet
* is lost (NewReno). This heuristics are the same in NewReno
* and SACK.
*
- * Imagine, that's all! Forget about all this shamanism about CWND inflation
- * deflation etc. CWND is real congestion window, never inflated, changes
- * only according to classic VJ rules.
- *
* Really tricky (and requiring careful tuning) part of algorithm
* is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
* The first determines the moment _when_ we should reduce CWND and,
@@ -2175,8 +2168,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
static bool tcp_time_to_recover(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
- __u32 packets_out;
- int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
/* Trick#1: The loss is proven. */
if (tp->lost_out)
@@ -2186,39 +2177,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
if (tcp_dupack_heuristics(tp) > tp->reordering)
return true;
- /* Trick#4: It is still not OK... But will it be useful to delay
- * recovery more?
- */
- packets_out = tp->packets_out;
- if (packets_out <= tp->reordering &&
- tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
- !tcp_may_send_now(sk)) {
- /* We have nothing to send. This connection is limited
- * either by receiver window or by application.
- */
- return true;
- }
-
- /* If a thin stream is detected, retransmit after first
- * received dupack. Employ only if SACK is supported in order
- * to avoid possible corner-case series of spurious retransmissions
- * Use only if there are no unsent data.
- */
- if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
- tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
- tcp_is_sack(tp) && !tcp_send_head(sk))
- return true;
-
- /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
- * retransmissions due to small network reorderings, we implement
- * Mitigation A.3 in the RFC and delay the retransmission for a short
- * interval if appropriate.
- */
- if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
- (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
- !tcp_may_send_now(sk))
- return !tcp_pause_early_retransmit(sk, flag);
-
return false;
}
@@ -2414,10 +2372,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
if (tp->prior_ssthresh) {
const struct inet_connection_sock *icsk = inet_csk(sk);
- if (icsk->icsk_ca_ops->undo_cwnd)
- tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
- else
- tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
+ tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
if (tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
@@ -2523,8 +2478,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
tcp_ecn_queue_cwr(tp);
}
-static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
- int flag)
+void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
@@ -2692,7 +2646,7 @@ void tcp_simple_retransmit(struct sock *sk)
}
EXPORT_SYMBOL(tcp_simple_retransmit);
-static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
+void tcp_enter_recovery(struct sock *sk, bool ece_ack)
{
struct tcp_sock *tp = tcp_sk(sk);
int mib_idx;
@@ -2728,14 +2682,18 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
tcp_try_undo_loss(sk, false))
return;
- if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
- /* Step 3.b. A timeout is spurious if not all data are
- * lost, i.e., never-retransmitted data are (s)acked.
- */
- if ((flag & FLAG_ORIG_SACK_ACKED) &&
- tcp_try_undo_loss(sk, true))
- return;
+ /* The ACK (s)acks some never-retransmitted data meaning not all
+ * the data packets before the timeout were lost. Therefore we
+ * undo the congestion window and state. This is essentially
+ * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since
+ * a retransmitted skb is permantly marked, we can apply such an
+ * operation even if F-RTO was not used.
+ */
+ if ((flag & FLAG_ORIG_SACK_ACKED) &&
+ tcp_try_undo_loss(sk, tp->undo_marker))
+ return;
+ if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
if (after(tp->snd_nxt, tp->high_seq)) {
if (flag & FLAG_DATA_SACKED || is_dupack)
tp->frto = 0; /* Step 3.a. loss was real */
@@ -2802,6 +2760,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
return false;
}
+static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
+ const struct skb_mstamp *ack_time)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Use RACK to detect loss */
+ if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
+ u32 prior_retrans = tp->retrans_out;
+
+ tcp_rack_mark_lost(sk, ack_time);
+ if (prior_retrans > tp->retrans_out)
+ *ack_flag |= FLAG_LOST_RETRANS;
+ }
+}
+
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
@@ -2815,7 +2788,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
* tcp_xmit_retransmit_queue().
*/
static void tcp_fastretrans_alert(struct sock *sk, const int acked,
- bool is_dupack, int *ack_flag, int *rexmit)
+ bool is_dupack, int *ack_flag, int *rexmit,
+ const struct skb_mstamp *ack_time)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -2866,13 +2840,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
}
}
- /* Use RACK to detect loss */
- if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
- tcp_rack_mark_lost(sk)) {
- flag |= FLAG_LOST_RETRANS;
- *ack_flag |= FLAG_LOST_RETRANS;
- }
-
/* E. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
@@ -2890,11 +2857,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
tcp_try_keep_open(sk);
return;
}
+ tcp_rack_identify_loss(sk, ack_flag, ack_time);
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack, rexmit);
- if (icsk->icsk_ca_state != TCP_CA_Open &&
- !(flag & FLAG_LOST_RETRANS))
+ tcp_rack_identify_loss(sk, ack_flag, ack_time);
+ if (!(icsk->icsk_ca_state == TCP_CA_Open ||
+ (*ack_flag & FLAG_LOST_RETRANS)))
return;
/* Change state if cwnd is undone or retransmits are lost */
default:
@@ -2908,6 +2877,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
+ tcp_rack_identify_loss(sk, ack_flag, ack_time);
if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag);
return;
@@ -3026,7 +2996,7 @@ void tcp_rearm_rto(struct sock *sk)
} else {
u32 rto = inet_csk(sk)->icsk_rto;
/* Offset the time elapsed after installing regular RTO */
- if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
struct sk_buff *skb = tcp_write_queue_head(sk);
const u32 rto_time_stamp =
@@ -3043,24 +3013,6 @@ void tcp_rearm_rto(struct sock *sk)
}
}
-/* This function is called when the delayed ER timer fires. TCP enters
- * fast recovery and performs fast-retransmit.
- */
-void tcp_resume_early_retransmit(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tcp_rearm_rto(sk);
-
- /* Stop if ER is disabled after the delayed ER timer is scheduled */
- if (!tp->do_early_retrans)
- return;
-
- tcp_enter_recovery(sk, false);
- tcp_update_scoreboard(sk, 1);
- tcp_xmit_retransmit_queue(sk);
-}
-
/* If we get here, the whole TSO packet has not been acked. */
static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
{
@@ -3103,11 +3055,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_snd_una, int *acked,
- struct tcp_sacktag_state *sack,
- struct skb_mstamp *now)
+ struct tcp_sacktag_state *sack)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct skb_mstamp first_ackt, last_ackt;
+ struct skb_mstamp *now = &sack->ack_time;
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
@@ -3167,7 +3119,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
} else if (tcp_is_sack(tp)) {
tp->delivered += acked_pcount;
if (!tcp_skb_spurious_retrans(tp, skb))
- tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
+ tcp_rack_advance(tp, sacked, scb->end_seq,
+ &skb->skb_mstamp,
+ &sack->ack_time);
}
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
@@ -3201,6 +3155,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->lost_skb_hint = NULL;
}
+ if (!skb)
+ tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+
if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
tp->snd_up = tp->snd_una;
@@ -3371,9 +3328,7 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
u32 delta = ack - tp->snd_una;
sock_owned_by_me((struct sock *)tp);
- u64_stats_update_begin_raw(&tp->syncp);
tp->bytes_acked += delta;
- u64_stats_update_end_raw(&tp->syncp);
tp->snd_una = ack;
}
@@ -3383,9 +3338,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
u32 delta = seq - tp->rcv_nxt;
sock_owned_by_me((struct sock *)tp);
- u64_stats_update_begin_raw(&tp->syncp);
tp->bytes_received += delta;
- u64_stats_update_end_raw(&tp->syncp);
tp->rcv_nxt = seq;
}
@@ -3598,7 +3551,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 lost = tp->lost;
int acked = 0; /* Number of packets newly acked */
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
- struct skb_mstamp now;
sack_state.first_sackt.v64 = 0;
sack_state.rate = &rs;
@@ -3624,10 +3576,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
- skb_mstamp_get(&now);
+ skb_mstamp_get(&sack_state.ack_time);
- if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
+ if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
if (after(ack, prior_snd_una)) {
@@ -3692,34 +3643,34 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
- &sack_state, &now);
+ &sack_state);
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+ &sack_state.ack_time);
}
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
- if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
- struct dst_entry *dst = __sk_dst_get(sk);
- if (dst)
- dst_confirm(dst);
- }
+ if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
+ sk_dst_confirm(sk);
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
lost = tp->lost - lost; /* freshly marked lost */
- tcp_rate_gen(sk, delivered, lost, &now, &rs);
- tcp_cong_control(sk, ack, delivered, flag, &rs);
+ tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time,
+ sack_state.rate);
+ tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
tcp_xmit_recovery(sk, rexmit);
return 1;
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK)
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+ &sack_state.ack_time);
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
@@ -3740,9 +3691,11 @@ old_ack:
* If data was DSACKed, see if we can undo a cwnd reduction.
*/
if (TCP_SKB_CB(skb)->sacked) {
+ skb_mstamp_get(&sack_state.ack_time);
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+ &sack_state.ack_time);
tcp_xmit_recovery(sk, rexmit);
}
@@ -4560,6 +4513,7 @@ add_sack:
end:
if (skb) {
tcp_grow_window(sk, skb);
+ skb_condense(skb);
skb_set_owner_r(skb, sk);
}
}
@@ -5081,10 +5035,13 @@ static void tcp_check_space(struct sock *sk)
if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
/* pairs with tcp_poll() */
- smp_mb__after_atomic();
+ smp_mb();
if (sk->sk_socket &&
- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
tcp_new_space(sk);
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+ }
}
}
@@ -5249,6 +5206,23 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
return err;
}
+/* Accept RST for rcv_nxt - 1 after a FIN.
+ * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
+ * FIN is sent followed by a RST packet. The RST is sent with the same
+ * sequence number as the FIN, and thus according to RFC 5961 a challenge
+ * ACK should be sent. However, Mac OSX rate limits replies to challenge
+ * ACKs on the closed socket. In addition middleboxes can drop either the
+ * challenge ACK or a subsequent RST.
+ */
+static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
+ (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
+ TCPF_CLOSING));
+}
+
/* Does PAWS and seqno based validation of an incoming segment, flags will
* play significant role here.
*/
@@ -5287,20 +5261,25 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
LINUX_MIB_TCPACKSKIPPEDSEQ,
&tp->last_oow_ack_time))
tcp_send_dupack(sk, skb);
+ } else if (tcp_reset_check(sk, skb)) {
+ tcp_reset(sk);
}
goto discard;
}
/* Step 2: check RST bit */
if (th->rst) {
- /* RFC 5961 3.2 (extend to match against SACK too if available):
- * If seq num matches RCV.NXT or the right-most SACK block,
+ /* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
+ * FIN and SACK too if available):
+ * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
+ * the right-most SACK block,
* then
* RESET the connection
* else
* Send a challenge ACK
*/
- if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
+ tcp_reset_check(sk, skb)) {
rst_seq_match = true;
} else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
struct tcp_sack_block *sp = &tp->selective_acks[0];
@@ -5571,6 +5550,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_set_state(sk, TCP_ESTABLISHED);
+ icsk->icsk_ack.lrcvtime = tcp_time_stamp;
if (skb) {
icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
@@ -5789,7 +5769,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* to stand against the temptation 8) --ANK
*/
inet_csk_schedule_ack(sk);
- icsk->icsk_ack.lrcvtime = tcp_time_stamp;
tcp_enter_quickack_mode(sk);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX, TCP_RTO_MAX);
@@ -5916,9 +5895,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (th->syn) {
if (th->fin)
goto discard;
- if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
- return 1;
+ /* It is possible that we process SYN packets from backlog,
+ * so we need to make sure to disable BH right there.
+ */
+ local_bh_disable();
+ acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
+ local_bh_enable();
+ if (!acceptable)
+ return 1;
consume_skb(skb);
return 0;
}
@@ -6022,7 +6007,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
break;
case TCP_FIN_WAIT1: {
- struct dst_entry *dst;
int tmo;
/* If we enter the TCP_FIN_WAIT1 state and we are a
@@ -6049,9 +6033,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tcp_set_state(sk, TCP_FIN_WAIT2);
sk->sk_shutdown |= SEND_SHUTDOWN;
- dst = __sk_dst_get(sk);
- if (dst)
- dst_confirm(dst);
+ sk_dst_confirm(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
/* Wake up lingering close() */
@@ -6318,13 +6300,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop;
}
-
- /* Accept backlog is full. If we have already queued enough
- * of warm entries in syn queue, drop request. It is better than
- * clogging syn queue with openreqs with exponentially increasing
- * timeout.
- */
- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
+ if (sk_acceptq_is_full(sk)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
goto drop;
}
@@ -6334,6 +6310,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop;
tcp_rsk(req)->af_specific = af_ops;
+ tcp_rsk(req)->ts_off = 0;
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
@@ -6355,6 +6332,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
+ if (isn && tmp_opt.tstamp_ok)
+ af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
+
if (!want_cookie && !isn) {
/* VJ's idea. We save last timestamp seen
* from the destination in peer table, when entering
@@ -6365,7 +6345,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
* timewait bucket, so that all the necessary checks
* are made in the function processing timewait state.
*/
- if (tcp_death_row.sysctl_tw_recycle) {
+ if (net->ipv4.tcp_death_row.sysctl_tw_recycle) {
bool strict;
dst = af_ops->route_req(sk, &fl, req, &strict);
@@ -6379,8 +6359,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
}
/* Kill the following clause, if you dislike this way. */
else if (!net->ipv4.sysctl_tcp_syncookies &&
- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
- (sysctl_max_syn_backlog >> 2)) &&
+ (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+ (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
!tcp_peer_is_proven(req, dst, false,
tmp_opt.saw_tstamp)) {
/* Without syncookies last quarter of
@@ -6395,7 +6375,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop_and_release;
}
- isn = af_ops->init_seq(skb);
+ isn = af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
}
if (!dst) {
dst = af_ops->route_req(sk, &fl, req, NULL);
@@ -6407,6 +6387,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (want_cookie) {
isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+ tcp_rsk(req)->ts_off = 0;
req->cookie_ts = tmp_opt.tstamp_ok;
if (!tmp_opt.tstamp_ok)
inet_rsk(req)->ecn_ok = 0;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2259114c7242..575e19dcc017 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -84,7 +84,6 @@
#include <crypto/hash.h>
#include <linux/scatterlist.h>
-int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
#ifdef CONFIG_TCP_MD5SIG
@@ -95,12 +94,12 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
-static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
+static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
{
return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr,
tcp_hdr(skb)->dest,
- tcp_hdr(skb)->source);
+ tcp_hdr(skb)->source, tsoff);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
@@ -120,7 +119,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
and use initial timestamp retrieved from peer table.
*/
if (tcptw->tw_ts_recent_stamp &&
- (!twp || (sysctl_tcp_tw_reuse &&
+ (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
if (tp->write_seq == 0)
@@ -146,7 +145,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
struct flowi4 *fl4;
struct rtable *rt;
int err;
+ u32 seq;
struct ip_options_rcu *inet_opt;
+ struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
@@ -197,7 +198,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tp->write_seq = 0;
}
- if (tcp_death_row.sysctl_tw_recycle &&
+ if (tcp_death_row->sysctl_tw_recycle &&
!tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
tcp_fetch_timewait_stamp(sk, &rt->dst);
@@ -216,7 +217,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
* complete initialization after this.
*/
tcp_set_state(sk, TCP_SYN_SENT);
- err = inet_hash_connect(&tcp_death_row, sk);
+ err = inet_hash_connect(tcp_death_row, sk);
if (err)
goto failure;
@@ -232,18 +233,27 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
/* OK, now commit destination to socket. */
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->dst);
+ rt = NULL;
- if (!tp->write_seq && likely(!tp->repair))
- tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
- inet->inet_daddr,
- inet->inet_sport,
- usin->sin_port);
+ if (likely(!tp->repair)) {
+ seq = secure_tcp_sequence_number(inet->inet_saddr,
+ inet->inet_daddr,
+ inet->inet_sport,
+ usin->sin_port,
+ &tp->tsoffset);
+ if (!tp->write_seq)
+ tp->write_seq = seq;
+ }
inet->inet_id = tp->write_seq ^ jiffies;
+ if (tcp_fastopen_defer_connect(sk, &err))
+ return err;
+ if (err)
+ goto failure;
+
err = tcp_connect(sk);
- rt = NULL;
if (err)
goto failure;
@@ -269,10 +279,13 @@ EXPORT_SYMBOL(tcp_v4_connect);
*/
void tcp_v4_mtu_reduced(struct sock *sk)
{
- struct dst_entry *dst;
struct inet_sock *inet = inet_sk(sk);
- u32 mtu = tcp_sk(sk)->mtu_info;
+ struct dst_entry *dst;
+ u32 mtu;
+ if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
+ return;
+ mtu = tcp_sk(sk)->mtu_info;
dst = inet_csk_update_pmtu(sk, mtu);
if (!dst)
return;
@@ -418,7 +431,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
switch (type) {
case ICMP_REDIRECT:
- do_redirect(icmp_skb, sk);
+ if (!sock_owned_by_user(sk))
+ do_redirect(icmp_skb, sk);
goto out;
case ICMP_SOURCE_QUENCH:
/* Just silently ignore these. */
@@ -442,7 +456,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (!sock_owned_by_user(sk)) {
tcp_v4_mtu_reduced(sk);
} else {
- if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
+ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
goto out;
@@ -691,6 +705,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
offsetof(struct inet_timewait_sock, tw_bound_dev_if));
arg.tos = ip_hdr(skb)->tos;
+ arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -711,7 +726,7 @@ out:
outside socket context is ugly, certainly. What can I do?
*/
-static void tcp_v4_send_ack(struct net *net,
+static void tcp_v4_send_ack(const struct sock *sk,
struct sk_buff *skb, u32 seq, u32 ack,
u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
@@ -726,6 +741,7 @@ static void tcp_v4_send_ack(struct net *net,
#endif
];
} rep;
+ struct net *net = sock_net(sk);
struct ip_reply_arg arg;
memset(&rep.th, 0, sizeof(struct tcphdr));
@@ -775,6 +791,7 @@ static void tcp_v4_send_ack(struct net *net,
if (oif)
arg.bound_dev_if = oif;
arg.tos = tos;
+ arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -790,7 +807,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
- tcp_v4_send_ack(sock_net(sk), skb,
+ tcp_v4_send_ack(sk, skb,
tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp + tcptw->tw_ts_offset,
@@ -818,10 +835,10 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
* exception of <SYN> segments, MUST be right-shifted by
* Rcv.Wind.Shift bits:
*/
- tcp_v4_send_ack(sock_net(sk), skb, seq,
+ tcp_v4_send_ack(sk, skb, seq,
tcp_rsk(req)->rcv_nxt,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
- tcp_time_stamp,
+ tcp_time_stamp + tcp_rsk(req)->ts_off,
req->ts_recent,
0,
tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
@@ -1315,10 +1332,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
tcp_ca_openreq_child(newsk, dst);
tcp_sync_mss(newsk, dst_mtu(dst));
- newtp->advmss = dst_metric_advmss(dst);
- if (tcp_sk(sk)->rx_opt.user_mss &&
- tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
- newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
+ newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
tcp_initialize_rcv_mss(newsk);
@@ -1552,8 +1566,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
* It has been noticed pure SACK packets were sometimes dropped
* (if cooked by drivers without copybreak feature).
*/
- if (!skb->data_len)
- skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+ skb_condense(skb);
if (unlikely(sk_add_backlog(sk, skb, limit))) {
bh_unlock_sock(sk);
@@ -1813,7 +1826,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
.getsockopt = ip_getsockopt,
.addr2sockaddr = inet_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in),
- .bind_conflict = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ip_setsockopt,
.compat_getsockopt = compat_ip_getsockopt,
@@ -1884,9 +1896,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
tcp_free_fastopen_req(tp);
tcp_saved_syn_free(tp);
- local_bh_disable();
sk_sockets_allocated_dec(sk);
- local_bh_enable();
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
@@ -1908,7 +1918,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
if (!sk) {
get_head:
ilb = &tcp_hashinfo.listening_hash[st->bucket];
- spin_lock_bh(&ilb->lock);
+ spin_lock(&ilb->lock);
sk = sk_head(&ilb->head);
st->offset = 0;
goto get_sk;
@@ -1925,7 +1935,7 @@ get_sk:
if (sk->sk_family == st->family)
return sk;
}
- spin_unlock_bh(&ilb->lock);
+ spin_unlock(&ilb->lock);
st->offset = 0;
if (++st->bucket < INET_LHTABLE_SIZE)
goto get_head;
@@ -2133,7 +2143,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
switch (st->state) {
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
- spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
+ spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
break;
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
@@ -2225,7 +2235,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
int state;
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
timer_expires = icsk->icsk_timeout;
@@ -2372,6 +2382,7 @@ struct proto tcp_prot = {
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
+ .keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
@@ -2415,7 +2426,7 @@ static void __net_exit tcp_sk_exit(struct net *net)
static int __net_init tcp_sk_init(struct net *net)
{
- int res, cpu;
+ int res, cpu, cnt;
net->ipv4.tcp_sk = alloc_percpu(struct sock *);
if (!net->ipv4.tcp_sk)
@@ -2452,6 +2463,14 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_orphan_retries = 0;
net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
+ net->ipv4.sysctl_tcp_tw_reuse = 0;
+
+ cnt = tcp_hashinfo.ehash_mask + 1;
+ net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
+ net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
+ net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
+
+ net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
return 0;
fail:
@@ -2462,7 +2481,7 @@ fail:
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
- inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
+ inet_twsk_purge(&tcp_hashinfo, AF_INET);
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
@@ -2473,7 +2492,6 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
void __init tcp_v4_init(void)
{
- inet_hashinfo_init(&tcp_hashinfo);
if (register_pernet_subsys(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");
}
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index c67ece1390c2..046fd3910873 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -316,6 +316,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
static struct tcp_congestion_ops tcp_lp __read_mostly = {
.init = tcp_lp_init,
.ssthresh = tcp_reno_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_lp_cong_avoid,
.pkts_acked = tcp_lp_pkts_acked,
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index bf1f3b2b29d1..0f46e5fe31ad 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -375,12 +375,10 @@ void tcp_update_metrics(struct sock *sk)
u32 val;
int m;
+ sk_dst_confirm(sk);
if (sysctl_tcp_nometrics_save || !dst)
return;
- if (dst->flags & DST_HOST)
- dst_confirm(dst);
-
rcu_read_lock();
if (icsk->icsk_backoff || !tp->srtt_us) {
/* This session failed to estimate rtt. Why?
@@ -493,11 +491,10 @@ void tcp_init_metrics(struct sock *sk)
struct tcp_metrics_block *tm;
u32 val, crtt = 0; /* cached RTT scaled by 8 */
+ sk_dst_confirm(sk);
if (!dst)
goto reset;
- dst_confirm(dst);
-
rcu_read_lock();
tm = tcp_get_metrics(sk, dst, true);
if (!tm) {
@@ -522,7 +519,6 @@ void tcp_init_metrics(struct sock *sk)
val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
if (val && tp->reordering != val) {
tcp_disable_fack(tp);
- tcp_disable_early_retrans(tp);
tp->reordering = val;
}
@@ -606,7 +602,6 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
return ret;
}
-EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
{
@@ -742,14 +737,7 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
rcu_read_unlock();
}
-static struct genl_family tcp_metrics_nl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = TCP_METRICS_GENL_NAME,
- .version = TCP_METRICS_GENL_VERSION,
- .maxattr = TCP_METRICS_ATTR_MAX,
- .netnsok = true,
-};
+static struct genl_family tcp_metrics_nl_family;
static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
[TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, },
@@ -1116,6 +1104,17 @@ static const struct genl_ops tcp_metrics_nl_ops[] = {
},
};
+static struct genl_family tcp_metrics_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = TCP_METRICS_GENL_NAME,
+ .version = TCP_METRICS_GENL_VERSION,
+ .maxattr = TCP_METRICS_ATTR_MAX,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = tcp_metrics_nl_ops,
+ .n_ops = ARRAY_SIZE(tcp_metrics_nl_ops),
+};
+
static unsigned int tcpmhash_entries;
static int __init set_tcpmhash_entries(char *str)
{
@@ -1179,8 +1178,7 @@ void __init tcp_metrics_init(void)
if (ret < 0)
panic("Could not allocate the tcp_metrics hash table\n");
- ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
- tcp_metrics_nl_ops);
+ ret = genl_register_family(&tcp_metrics_nl_family);
if (ret < 0)
panic("Could not register tcp_metrics generic netlink\n");
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6234ebaa7db1..65c0f3d13eca 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -29,12 +29,6 @@
int sysctl_tcp_abort_on_overflow __read_mostly;
-struct inet_timewait_death_row tcp_death_row = {
- .sysctl_max_tw_buckets = NR_FILE * 2,
- .hashinfo = &tcp_hashinfo,
-};
-EXPORT_SYMBOL_GPL(tcp_death_row);
-
static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
if (seq == s_win)
@@ -100,13 +94,15 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
struct tcp_options_received tmp_opt;
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
bool paws_reject = false;
+ struct inet_timewait_death_row *tcp_death_row = &sock_net((struct sock*)tw)->ipv4.tcp_death_row;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
tcp_parse_options(skb, &tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
- tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
+ if (tmp_opt.rcv_tsecr)
+ tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
tmp_opt.ts_recent = tcptw->tw_ts_recent;
tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
@@ -153,7 +149,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
}
- if (tcp_death_row.sysctl_tw_recycle &&
+ if (tcp_death_row->sysctl_tw_recycle &&
tcptw->tw_ts_recent_stamp &&
tcp_tw_remember_stamp(tw))
inet_twsk_reschedule(tw, tw->tw_timeout);
@@ -264,11 +260,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
const struct tcp_sock *tp = tcp_sk(sk);
struct inet_timewait_sock *tw;
bool recycle_ok = false;
+ struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
- if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
+ if (tcp_death_row->sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
recycle_ok = tcp_remember_stamp(sk);
- tw = inet_twsk_alloc(sk, &tcp_death_row, state);
+ tw = inet_twsk_alloc(sk, tcp_death_row, state);
if (tw) {
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
@@ -364,15 +361,12 @@ void tcp_openreq_init_rwin(struct request_sock *req,
{
struct inet_request_sock *ireq = inet_rsk(req);
const struct tcp_sock *tp = tcp_sk(sk_listener);
- u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);
int full_space = tcp_full_space(sk_listener);
- int mss = dst_metric_advmss(dst);
u32 window_clamp;
__u8 rcv_wscale;
+ int mss;
- if (user_mss && user_mss < mss)
- mss = user_mss;
-
+ mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
window_clamp = READ_ONCE(tp->window_clamp);
/* Set this up on the first call only */
req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
@@ -466,13 +460,13 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U);
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
+ newicsk->icsk_ack.lrcvtime = tcp_time_stamp;
newtp->packets_out = 0;
newtp->retrans_out = 0;
newtp->sacked_out = 0;
newtp->fackets_out = 0;
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
- tcp_enable_early_retrans(newtp);
newtp->tlp_high_seq = 0;
newtp->lsndtime = treq->snt_synack.stamp_jiffies;
newsk->sk_txhash = treq->txhash;
@@ -532,7 +526,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->rx_opt.ts_recent_stamp = 0;
newtp->tcp_header_len = sizeof(struct tcphdr);
}
- newtp->tsoffset = 0;
+ newtp->tsoffset = treq->ts_off;
#ifdef CONFIG_TCP_MD5SIG
newtp->md5sig_info = NULL; /*XXX*/
if (newtp->af_specific->md5_lookup(sk, newsk))
@@ -581,6 +575,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
+ if (tmp_opt.rcv_tsecr)
+ tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
/* We do not store true stamp, but it is not required,
* it can be estimated (approximately)
* from another data.
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 896e9dfbdb5c..c3c082ed3879 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -76,16 +76,15 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
tp->packets_out += tcp_skb_pcount(skb);
- if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
- }
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
tcp_skb_pcount(skb));
}
-/* SND.NXT, if window was not shrunk.
+/* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
+ * window scaling factor due to loss of precision.
* If window has been shrunk, what should we make? It is not clear at all.
* Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
* Anything in between SND.UNA...SND.UNA+SND.WND also can be already
@@ -95,7 +94,9 @@ static inline __u32 tcp_acceptable_seq(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- if (!before(tcp_wnd_end(tp), tp->snd_nxt))
+ if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
+ (tp->rx_opt.wscale_ok &&
+ ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
return tp->snd_nxt;
else
return tcp_wnd_end(tp);
@@ -640,7 +641,7 @@ static unsigned int tcp_synack_options(struct request_sock *req,
}
if (likely(ireq->tstamp_ok)) {
opts->options |= OPTION_TS;
- opts->tsval = tcp_skb_timestamp(skb);
+ opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
opts->tsecr = req->ts_recent;
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
@@ -769,25 +770,27 @@ static void tcp_tasklet_func(unsigned long data)
list_del(&tp->tsq_node);
sk = (struct sock *)tp;
- bh_lock_sock(sk);
-
- if (!sock_owned_by_user(sk)) {
- tcp_tsq_handler(sk);
- } else {
- /* defer the work to tcp_release_cb() */
- set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+ smp_mb__before_atomic();
+ clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
+
+ if (!sk->sk_lock.owned &&
+ test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
+ tcp_tsq_handler(sk);
+ }
+ bh_unlock_sock(sk);
}
- bh_unlock_sock(sk);
- clear_bit(TSQ_QUEUED, &tp->tsq_flags);
sk_free(sk);
}
}
-#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
- (1UL << TCP_WRITE_TIMER_DEFERRED) | \
- (1UL << TCP_DELACK_TIMER_DEFERRED) | \
- (1UL << TCP_MTU_REDUCED_DEFERRED))
+#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
+ TCPF_WRITE_TIMER_DEFERRED | \
+ TCPF_DELACK_TIMER_DEFERRED | \
+ TCPF_MTU_REDUCED_DEFERRED)
/**
* tcp_release_cb - tcp release_sock() callback
* @sk: socket
@@ -797,18 +800,17 @@ static void tcp_tasklet_func(unsigned long data)
*/
void tcp_release_cb(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
unsigned long flags, nflags;
/* perform an atomic operation only if at least one flag is set */
do {
- flags = tp->tsq_flags;
+ flags = sk->sk_tsq_flags;
if (!(flags & TCP_DEFERRED_ALL))
return;
nflags = flags & ~TCP_DEFERRED_ALL;
- } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+ } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
- if (flags & (1UL << TCP_TSQ_DEFERRED))
+ if (flags & TCPF_TSQ_DEFERRED)
tcp_tsq_handler(sk);
/* Here begins the tricky part :
@@ -822,15 +824,15 @@ void tcp_release_cb(struct sock *sk)
*/
sock_release_ownership(sk);
- if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
+ if (flags & TCPF_WRITE_TIMER_DEFERRED) {
tcp_write_timer_handler(sk);
__sock_put(sk);
}
- if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
+ if (flags & TCPF_DELACK_TIMER_DEFERRED) {
tcp_delack_timer_handler(sk);
__sock_put(sk);
}
- if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
+ if (flags & TCPF_MTU_REDUCED_DEFERRED) {
inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
__sock_put(sk);
}
@@ -860,6 +862,7 @@ void tcp_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long flags, nval, oval;
int wmem;
/* Keep one reference on sk_wmem_alloc.
@@ -877,16 +880,25 @@ void tcp_wfree(struct sk_buff *skb)
if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
goto out;
- if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
- !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
- unsigned long flags;
+ for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
struct tsq_tasklet *tsq;
+ bool empty;
+
+ if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
+ goto out;
+
+ nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+ nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
+ if (nval != oval)
+ continue;
/* queue this socket to tasklet queue */
local_irq_save(flags);
tsq = this_cpu_ptr(&tsq_tasklet);
+ empty = list_empty(&tsq->head);
list_add(&tp->tsq_node, &tsq->head);
- tasklet_schedule(&tsq->tasklet);
+ if (empty)
+ tasklet_schedule(&tsq->tasklet);
local_irq_restore(flags);
return;
}
@@ -955,6 +967,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
*/
skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
+ /* If we had to use memory reserve to allocate this skb,
+ * this might cause drops if packet is looped back :
+ * Other socket might not have SOCK_MEMALLOC.
+ * Packets not looped back do not care about pfmemalloc.
+ */
+ skb->pfmemalloc = 0;
+
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -964,6 +983,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_set_hash_from_sk(skb, sk);
atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+ skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
+
/* Build TCP header and checksum it. */
th = (struct tcphdr *)skb->data;
th->source = inet->inet_sport;
@@ -1027,7 +1048,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
/* Our usage of tstamp should remain private */
- skb->tstamp.tv64 = 0;
+ skb->tstamp = 0;
/* Cleanup our debris for IP stacks */
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
@@ -1514,6 +1535,18 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
if (sysctl_tcp_slow_start_after_idle &&
(s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
tcp_cwnd_application_limited(sk);
+
+ /* The following conditions together indicate the starvation
+ * is caused by insufficient sender buffer:
+ * 1) just sent some data (see tcp_write_xmit)
+ * 2) not cwnd limited (this else condition)
+ * 3) no more data to send (null tcp_send_head )
+ * 4) application is hitting buffer limit (SOCK_NOSPACE)
+ */
+ if (!tcp_send_head(sk) && sk->sk_socket &&
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
+ (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+ tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
}
@@ -1910,26 +1943,26 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk)
*/
static int tcp_mtu_probe(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb, *nskb, *next;
struct net *net = sock_net(sk);
- int len;
int probe_size;
int size_needed;
- int copy;
+ int copy, len;
int mss_now;
int interval;
/* Not currently probing/verifying,
* not in recovery,
* have enough cwnd, and
- * not SACKing (the variable headers throw things off) */
- if (!icsk->icsk_mtup.enabled ||
- icsk->icsk_mtup.probe_size ||
- inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
- tp->snd_cwnd < 11 ||
- tp->rx_opt.num_sacks || tp->rx_opt.dsack)
+ * not SACKing (the variable headers throw things off)
+ */
+ if (likely(!icsk->icsk_mtup.enabled ||
+ icsk->icsk_mtup.probe_size ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+ tp->snd_cwnd < 11 ||
+ tp->rx_opt.num_sacks || tp->rx_opt.dsack))
return -1;
/* Use binary search for probe_size between tcp_mss_base,
@@ -2069,7 +2102,16 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit <<= factor;
if (atomic_read(&sk->sk_wmem_alloc) > limit) {
- set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
+ /* Always send the 1st or 2nd skb in write queue.
+ * No need to wait for TX completion to call us back,
+ * after softirq/tasklet schedule.
+ * This helps when TX completions are delayed too much.
+ */
+ if (skb == sk->sk_write_queue.next ||
+ skb->prev == sk->sk_write_queue.next)
+ return false;
+
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED, so we must
* test again the condition.
@@ -2081,6 +2123,47 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
return false;
}
+static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
+{
+ const u32 now = tcp_time_stamp;
+
+ if (tp->chrono_type > TCP_CHRONO_UNSPEC)
+ tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start;
+ tp->chrono_start = now;
+ tp->chrono_type = new;
+}
+
+void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* If there are multiple conditions worthy of tracking in a
+ * chronograph then the highest priority enum takes precedence
+ * over the other conditions. So that if something "more interesting"
+ * starts happening, stop the previous chrono and start a new one.
+ */
+ if (type > tp->chrono_type)
+ tcp_chrono_set(tp, type);
+}
+
+void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+
+ /* There are multiple conditions worthy of tracking in a
+ * chronograph, so that the highest priority enum takes
+ * precedence over the other conditions (see tcp_chrono_start).
+ * If a condition stops, we only stop chrono tracking if
+ * it's the "most interesting" or current chrono we are
+ * tracking and starts busy chrono if we have pending data.
+ */
+ if (tcp_write_queue_empty(sk))
+ tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
+ else if (type == tp->chrono_type)
+ tcp_chrono_set(tp, TCP_CHRONO_BUSY);
+}
+
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
@@ -2103,7 +2186,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unsigned int tso_segs, sent_pkts;
int cwnd_quota;
int result;
- bool is_cwnd_limited = false;
+ bool is_cwnd_limited = false, is_rwnd_limited = false;
u32 max_segs;
sent_pkts = 0;
@@ -2140,8 +2223,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
break;
}
- if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+ if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
+ is_rwnd_limited = true;
break;
+ }
if (tso_segs == 1) {
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
@@ -2167,6 +2252,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
+ if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+ clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
if (tcp_small_queue_check(sk, skb, 0))
break;
@@ -2186,6 +2273,11 @@ repair:
break;
}
+ if (is_rwnd_limited)
+ tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
+ else
+ tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
+
if (likely(sent_pkts)) {
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += sent_pkts;
@@ -2207,8 +2299,6 @@ bool tcp_schedule_loss_probe(struct sock *sk)
u32 timeout, tlp_time_stamp, rto_time_stamp;
u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
- if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
- return false;
/* No consecutive loss probes. */
if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
tcp_rearm_rto(sk);
@@ -2227,8 +2317,9 @@ bool tcp_schedule_loss_probe(struct sock *sk)
/* Schedule a loss probe in 2*RTT for SACK capable connections
* in Open state, that are either limited by cwnd or application.
*/
- if (sysctl_tcp_early_retrans < 3 || !tp->packets_out ||
- !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+ if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) ||
+ !tp->packets_out || !tcp_is_sack(tp) ||
+ icsk->icsk_ca_state != TCP_CA_Open)
return false;
if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
@@ -2436,9 +2527,11 @@ u32 __tcp_select_window(struct sock *sk)
int full_space = min_t(int, tp->window_clamp, allowed_space);
int window;
- if (mss > full_space)
+ if (unlikely(mss > full_space)) {
mss = full_space;
-
+ if (mss <= 0)
+ return 0;
+ }
if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0;
@@ -2514,7 +2607,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
}
/* Collapses two adjacent SKB's during retransmission. */
-static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
+static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
@@ -2525,13 +2618,17 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
+ if (next_skb_size) {
+ if (next_skb_size <= skb_availroom(skb))
+ skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
+ next_skb_size);
+ else if (!skb_shift(skb, next_skb, next_skb_size))
+ return false;
+ }
tcp_highest_sack_combine(sk, next_skb, skb);
tcp_unlink_write_queue(next_skb, sk);
- skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
- next_skb_size);
-
if (next_skb->ip_summed == CHECKSUM_PARTIAL)
skb->ip_summed = CHECKSUM_PARTIAL;
@@ -2560,6 +2657,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
tcp_skb_collapse_tstamp(skb, next_skb);
sk_wmem_free_skb(sk, next_skb);
+ return true;
}
/* Check if coalescing SKBs is legal. */
@@ -2567,14 +2665,11 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
{
if (tcp_skb_pcount(skb) > 1)
return false;
- /* TODO: SACK collapsing could be used to remove this condition */
- if (skb_shinfo(skb)->nr_frags != 0)
- return false;
if (skb_cloned(skb))
return false;
if (skb == tcp_send_head(sk))
return false;
- /* Some heurestics for collapsing over SACK'd could be invented */
+ /* Some heuristics for collapsing over SACK'd could be invented */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
return false;
@@ -2612,16 +2707,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
if (space < 0)
break;
- /* Punt if not enough space exists in the first SKB for
- * the data in the second
- */
- if (skb->len > skb_availroom(to))
- break;
if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
break;
- tcp_collapse_retrans(sk, to);
+ if (!tcp_collapse_retrans(sk, to))
+ break;
}
}
@@ -2694,6 +2785,13 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
tcp_ecn_clear_syn(sk, skb);
+ /* Update global and local TCP statistics. */
+ segs = tcp_skb_pcount(skb);
+ TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ tp->total_retrans += segs;
+
/* make sure skb->data is aligned on arches that require it
* and check if ack-trimming & collapsing extended the headroom
* beyond what csum_start can cover.
@@ -2711,14 +2809,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
}
if (likely(!err)) {
- segs = tcp_skb_pcount(skb);
-
TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
- /* Update global TCP statistics. */
- TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
- tp->total_retrans += segs;
+ } else if (err != -EBUSY) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
return err;
}
@@ -2741,8 +2834,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if (!tp->retrans_stamp)
tp->retrans_stamp = tcp_skb_timestamp(skb);
- } else if (err != -EBUSY) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
if (tp->undo_retrans < 0)
@@ -2751,36 +2842,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
return err;
}
-/* Check if we forward retransmits are possible in the current
- * window/congestion state.
- */
-static bool tcp_can_forward_retransmit(struct sock *sk)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- const struct tcp_sock *tp = tcp_sk(sk);
-
- /* Forward retransmissions are possible only during Recovery. */
- if (icsk->icsk_ca_state != TCP_CA_Recovery)
- return false;
-
- /* No forward retransmissions in Reno are possible. */
- if (tcp_is_reno(tp))
- return false;
-
- /* Yeah, we have to make difficult choice between forward transmission
- * and retransmission... Both ways have their merits...
- *
- * For now we do not retransmit anything, while we have some new
- * segments to send. In the other cases, follow rule 3 for
- * NextSeg() specified in RFC3517.
- */
-
- if (tcp_may_send_now(sk))
- return false;
-
- return true;
-}
-
/* This gets called after a retransmit timeout, and the initially
* retransmitted data is acknowledged. It tries to continue
* resending the rest of the retransmit queue, until either
@@ -2795,24 +2856,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
struct sk_buff *hole = NULL;
- u32 max_segs, last_lost;
+ u32 max_segs;
int mib_idx;
- int fwd_rexmitting = 0;
if (!tp->packets_out)
return;
- if (!tp->lost_out)
- tp->retransmit_high = tp->snd_una;
-
if (tp->retransmit_skb_hint) {
skb = tp->retransmit_skb_hint;
- last_lost = TCP_SKB_CB(skb)->end_seq;
- if (after(last_lost, tp->retransmit_high))
- last_lost = tp->retransmit_high;
} else {
skb = tcp_write_queue_head(sk);
- last_lost = tp->snd_una;
}
max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
@@ -2835,31 +2888,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
*/
segs = min_t(int, segs, max_segs);
- if (fwd_rexmitting) {
-begin_fwd:
- if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
- break;
- mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
-
- } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
- tp->retransmit_high = last_lost;
- if (!tcp_can_forward_retransmit(sk))
- break;
- /* Backtrack if necessary to non-L'ed skb */
- if (hole) {
- skb = hole;
- hole = NULL;
- }
- fwd_rexmitting = 1;
- goto begin_fwd;
-
+ if (tp->retrans_out >= tp->lost_out) {
+ break;
} else if (!(sacked & TCPCB_LOST)) {
if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
hole = skb;
continue;
} else {
- last_lost = TCP_SKB_CB(skb)->end_seq;
if (icsk->icsk_ca_state != TCP_CA_Loss)
mib_idx = LINUX_MIB_TCPFASTRETRANS;
else
@@ -2880,7 +2916,8 @@ begin_fwd:
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
- if (skb == tcp_write_queue_head(sk))
+ if (skb == tcp_write_queue_head(sk) &&
+ icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto,
TCP_RTO_MAX);
@@ -2962,6 +2999,8 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
{
struct sk_buff *skb;
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
+
/* NOTE: No TCP options attached and we never retransmit this. */
skb = alloc_skb(MAX_TCP_HEADER, priority);
if (!skb) {
@@ -2977,8 +3016,6 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
/* Send it off. */
if (tcp_transmit_skb(sk, skb, 0, priority))
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
-
- TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
}
/* Send a crossed SYN-ACK during socket establishment.
@@ -3037,7 +3074,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct sk_buff *skb;
int tcp_header_size;
struct tcphdr *th;
- u16 user_mss;
int mss;
skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
@@ -3067,10 +3103,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
}
skb_dst_set(skb, dst);
- mss = dst_metric_advmss(dst);
- user_mss = READ_ONCE(tp->rx_opt.user_mss);
- if (user_mss && user_mss < mss)
- mss = user_mss;
+ mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
@@ -3123,7 +3156,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
#endif
/* Do not fool tcpdump (if any), clean our debris */
- skb->tstamp.tv64 = 0;
+ skb->tstamp = 0;
return skb;
}
EXPORT_SYMBOL(tcp_make_synack);
@@ -3176,9 +3209,7 @@ static void tcp_connect_init(struct sock *sk)
if (!tp->window_clamp)
tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
- tp->advmss = dst_metric_advmss(dst);
- if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
- tp->advmss = tp->rx_opt.user_mss;
+ tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
tcp_initialize_rcv_mss(sk);
@@ -3244,31 +3275,19 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
- int syn_loss = 0, space, err = 0;
- unsigned long last_syn_loss = 0;
+ int space, err = 0;
struct sk_buff *syn_data;
tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
- tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
- &syn_loss, &last_syn_loss);
- /* Recurring FO SYN losses: revert to regular handshake temporarily */
- if (syn_loss > 1 &&
- time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
- fo->cookie.len = -1;
- goto fallback;
- }
-
- if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
- fo->cookie.len = -1;
- else if (fo->cookie.len <= 0)
+ if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
goto fallback;
/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
* user-MSS. Reserve maximum option space for middleboxes that add
* private TCP options. The cost is reduced data space in SYN :(
*/
- if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
- tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+ tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
+
space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
MAX_TCP_OPTION_SPACE;
@@ -3300,6 +3319,8 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
fo->copied = space;
tcp_connect_queue_skb(sk, syn_data);
+ if (syn_data->len)
+ tcp_chrono_start(sk, TCP_CHRONO_BUSY);
err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
@@ -3464,8 +3485,6 @@ void tcp_send_ack(struct sock *sk)
/* We do not want pure acks influencing TCP Small Queues or fq/pacing
* too much.
* SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
- * We also avoid tcp_wfree() overhead (cache line miss accessing
- * tp->tsq_flags) by using regular sock_wfree()
*/
skb_set_tcp_pure_ack(buff);
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index e36df4fcfeba..d8acbd9f477a 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -1,9 +1,33 @@
#include <linux/tcp.h>
#include <net/tcp.h>
-int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
+int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION;
-/* Marks a packet lost, if some packet sent later has been (s)acked.
+static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_skb_mark_lost_uncond_verify(tp, skb);
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+ /* Account for retransmits that are lost again */
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
+ tcp_skb_pcount(skb));
+ }
+}
+
+static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
+ const struct skb_mstamp *t2,
+ u32 seq1, u32 seq2)
+{
+ return skb_mstamp_after(t1, t2) ||
+ (t1->v64 == t2->v64 && after(seq1, seq2));
+}
+
+/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
+ *
+ * Marks a packet lost, if some packet sent later has been (s)acked.
* The underlying idea is similar to the traditional dupthresh and FACK
* but they look at different metrics:
*
@@ -16,31 +40,26 @@ int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
* is being more resilient to reordering by simply allowing some
* "settling delay", instead of tweaking the dupthresh.
*
- * The current version is only used after recovery starts but can be
- * easily extended to detect the first loss.
+ * When tcp_rack_detect_loss() detects some packets are lost and we
+ * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
+ * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
+ * make us enter the CA_Recovery state.
*/
-int tcp_rack_mark_lost(struct sock *sk)
+static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
+ u32 *reo_timeout)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- u32 reo_wnd, prior_retrans = tp->retrans_out;
-
- if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
- return 0;
-
- /* Reset the advanced flag to avoid unnecessary queue scanning */
- tp->rack.advanced = 0;
+ u32 reo_wnd;
+ *reo_timeout = 0;
/* To be more reordering resilient, allow min_rtt/4 settling delay
* (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
* RTT because reordering is often a path property and less related
* to queuing or delayed ACKs.
- *
- * TODO: measure and adapt to the observed reordering delay, and
- * use a timer to retransmit like the delayed early retransmit.
*/
reo_wnd = 1000;
- if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
+ if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
tcp_for_write_queue(skb, sk) {
@@ -54,20 +73,29 @@ int tcp_rack_mark_lost(struct sock *sk)
scb->sacked & TCPCB_SACKED_ACKED)
continue;
- if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
+ if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp,
+ tp->rack.end_seq, scb->end_seq)) {
+ /* Step 3 in draft-cheng-tcpm-rack-00.txt:
+ * A packet is lost if its elapsed time is beyond
+ * the recent RTT plus the reordering window.
+ */
+ u32 elapsed = skb_mstamp_us_delta(now,
+ &skb->skb_mstamp);
+ s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
- if (skb_mstamp_us_delta(&tp->rack.mstamp,
- &skb->skb_mstamp) <= reo_wnd)
+ if (remaining < 0) {
+ tcp_rack_mark_skb_lost(sk, skb);
continue;
-
- /* skb is lost if packet sent later is sacked */
- tcp_skb_mark_lost_uncond_verify(tp, skb);
- if (scb->sacked & TCPCB_SACKED_RETRANS) {
- scb->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPLOSTRETRANSMIT);
}
+
+ /* Skip ones marked lost but not yet retransmitted */
+ if ((scb->sacked & TCPCB_LOST) &&
+ !(scb->sacked & TCPCB_SACKED_RETRANS))
+ continue;
+
+ /* Record maximum wait time (+1 to avoid 0) */
+ *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
+
} else if (!(scb->sacked & TCPCB_RETRANS)) {
/* Original data are sent sequentially so stop early
* b/c the rest are all sent after rack_sent
@@ -75,20 +103,43 @@ int tcp_rack_mark_lost(struct sock *sk)
break;
}
}
- return prior_retrans - tp->retrans_out;
}
-/* Record the most recently (re)sent time among the (s)acked packets */
-void tcp_rack_advance(struct tcp_sock *tp,
- const struct skb_mstamp *xmit_time, u8 sacked)
+void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 timeout;
+
+ if (!tp->rack.advanced)
+ return;
+
+ /* Reset the advanced flag to avoid unnecessary queue scanning */
+ tp->rack.advanced = 0;
+ tcp_rack_detect_loss(sk, now, &timeout);
+ if (timeout) {
+ timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
+ timeout, inet_csk(sk)->icsk_rto);
+ }
+}
+
+/* Record the most recently (re)sent time among the (s)acked packets
+ * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
+ * draft-cheng-tcpm-rack-00.txt
+ */
+void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
+ const struct skb_mstamp *xmit_time,
+ const struct skb_mstamp *ack_time)
{
+ u32 rtt_us;
+
if (tp->rack.mstamp.v64 &&
- !skb_mstamp_after(xmit_time, &tp->rack.mstamp))
+ !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp,
+ end_seq, tp->rack.end_seq))
return;
+ rtt_us = skb_mstamp_us_delta(ack_time, xmit_time);
if (sacked & TCPCB_RETRANS) {
- struct skb_mstamp now;
-
/* If the sacked packet was retransmitted, it's ambiguous
* whether the retransmission or the original (or the prior
* retransmission) was sacked.
@@ -99,11 +150,35 @@ void tcp_rack_advance(struct tcp_sock *tp,
* so it's at least one RTT (i.e., retransmission is at least
* an RTT later).
*/
- skb_mstamp_get(&now);
- if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
+ if (rtt_us < tcp_min_rtt(tp))
return;
}
-
+ tp->rack.rtt_us = rtt_us;
tp->rack.mstamp = *xmit_time;
+ tp->rack.end_seq = end_seq;
tp->rack.advanced = 1;
}
+
+/* We have waited long enough to accommodate reordering. Mark the expired
+ * packets lost and retransmit them.
+ */
+void tcp_rack_reo_timeout(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct skb_mstamp now;
+ u32 timeout, prior_inflight;
+
+ skb_mstamp_get(&now);
+ prior_inflight = tcp_packets_in_flight(tp);
+ tcp_rack_detect_loss(sk, &now, &timeout);
+ if (prior_inflight != tcp_packets_in_flight(tp)) {
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
+ tcp_enter_recovery(sk, false);
+ if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+ tcp_cwnd_reduction(sk, 1, 0);
+ }
+ tcp_xmit_retransmit_queue(sk);
+ }
+ if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
+ tcp_rearm_rto(sk);
+}
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index bf5ea9e9bbc1..f2123075ce6e 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -15,6 +15,10 @@
#define TCP_SCALABLE_AI_CNT 50U
#define TCP_SCALABLE_MD_SCALE 3
+struct scalable {
+ u32 loss_cwnd;
+};
+
static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -32,12 +36,23 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
static u32 tcp_scalable_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
+ struct scalable *ca = inet_csk_ca(sk);
+
+ ca->loss_cwnd = tp->snd_cwnd;
return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
}
+static u32 tcp_scalable_cwnd_undo(struct sock *sk)
+{
+ const struct scalable *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
static struct tcp_congestion_ops tcp_scalable __read_mostly = {
.ssthresh = tcp_scalable_ssthresh,
+ .undo_cwnd = tcp_scalable_cwnd_undo,
.cong_avoid = tcp_scalable_cong_avoid,
.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3ea1cf804748..b2ab411c6d37 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -249,7 +249,8 @@ void tcp_delack_timer_handler(struct sock *sk)
sk_mem_reclaim_partial(sk);
- if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
+ if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
+ !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
goto out;
if (time_after(icsk->icsk_ack.timeout, jiffies)) {
@@ -310,7 +311,7 @@ static void tcp_delack_timer(unsigned long data)
inet_csk(sk)->icsk_ack.blocked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
/* deleguate our work to tcp_release_cb() */
- if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
@@ -552,7 +553,8 @@ void tcp_write_timer_handler(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
int event;
- if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
+ if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
+ !icsk->icsk_pending)
goto out;
if (time_after(icsk->icsk_timeout, jiffies)) {
@@ -563,8 +565,8 @@ void tcp_write_timer_handler(struct sock *sk)
event = icsk->icsk_pending;
switch (event) {
- case ICSK_TIME_EARLY_RETRANS:
- tcp_resume_early_retransmit(sk);
+ case ICSK_TIME_REO_TIMEOUT:
+ tcp_rack_reo_timeout(sk);
break;
case ICSK_TIME_LOSS_PROBE:
tcp_send_loss_probe(sk);
@@ -592,7 +594,7 @@ static void tcp_write_timer(unsigned long data)
tcp_write_timer_handler(sk);
} else {
/* delegate our work to tcp_release_cb() */
- if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
@@ -617,6 +619,7 @@ void tcp_set_keepalive(struct sock *sk, int val)
else if (!val)
inet_csk_delete_keepalive_timer(sk);
}
+EXPORT_SYMBOL_GPL(tcp_set_keepalive);
static void tcp_keepalive_timer (unsigned long data)
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 4c4bac1b5eab..218cfcc77650 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -307,6 +307,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
static struct tcp_congestion_ops tcp_vegas __read_mostly = {
.init = tcp_vegas_init,
.ssthresh = tcp_reno_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_vegas_cong_avoid,
.pkts_acked = tcp_vegas_pkts_acked,
.set_state = tcp_vegas_state,
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 40171e163cff..76005d4b8dfc 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -30,6 +30,7 @@ struct veno {
u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */
u32 inc; /* decide whether to increase cwnd */
u32 diff; /* calculate the diff rate */
+ u32 loss_cwnd; /* cwnd when loss occured */
};
/* There are several situations when we must "re-start" Veno:
@@ -193,6 +194,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
struct veno *veno = inet_csk_ca(sk);
+ veno->loss_cwnd = tp->snd_cwnd;
if (veno->diff < beta)
/* in "non-congestive state", cut cwnd by 1/5 */
return max(tp->snd_cwnd * 4 / 5, 2U);
@@ -201,9 +203,17 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
return max(tp->snd_cwnd >> 1U, 2U);
}
+static u32 tcp_veno_cwnd_undo(struct sock *sk)
+{
+ const struct veno *veno = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, veno->loss_cwnd);
+}
+
static struct tcp_congestion_ops tcp_veno __read_mostly = {
.init = tcp_veno_init,
.ssthresh = tcp_veno_ssthresh,
+ .undo_cwnd = tcp_veno_cwnd_undo,
.cong_avoid = tcp_veno_cong_avoid,
.pkts_acked = tcp_veno_pkts_acked,
.set_state = tcp_veno_state,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 4b03a2e2a050..fed66dc0e0f5 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -278,6 +278,7 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = {
.init = tcp_westwood_init,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cwnd_event = tcp_westwood_event,
.in_ack_event = tcp_westwood_ack,
.get_info = tcp_westwood_info,
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 9c5fc973267f..e6ff99c4bd3b 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -37,6 +37,7 @@ struct yeah {
u32 fast_count;
u32 pkts_acked;
+ u32 loss_cwnd;
};
static void tcp_yeah_init(struct sock *sk)
@@ -219,13 +220,22 @@ static u32 tcp_yeah_ssthresh(struct sock *sk)
yeah->fast_count = 0;
yeah->reno_count = max(yeah->reno_count>>1, 2U);
+ yeah->loss_cwnd = tp->snd_cwnd;
return max_t(int, tp->snd_cwnd - reduction, 2);
}
+static u32 tcp_yeah_cwnd_undo(struct sock *sk)
+{
+ const struct yeah *yeah = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, yeah->loss_cwnd);
+}
+
static struct tcp_congestion_ops tcp_yeah __read_mostly = {
.init = tcp_yeah_init,
.ssthresh = tcp_yeah_ssthresh,
+ .undo_cwnd = tcp_yeah_cwnd_undo,
.cong_avoid = tcp_yeah_cong_avoid,
.set_state = tcp_vegas_state,
.cwnd_event = tcp_vegas_cwnd_event,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5bab6c3f7a2f..ea6e4cff9faf 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -79,7 +79,7 @@
#define pr_fmt(fmt) "UDP: " fmt
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
@@ -134,14 +134,21 @@ EXPORT_SYMBOL(udp_memory_allocated);
#define MAX_UDP_PORTS 65536
#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
+/* IPCB reference means this can not be used from early demux */
+static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ if (!net->ipv4.sysctl_udp_l3mdev_accept &&
+ skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
+ return true;
+#endif
+ return false;
+}
+
static int udp_lib_lport_inuse(struct net *net, __u16 num,
const struct udp_hslot *hslot,
unsigned long *bitmap,
- struct sock *sk,
- int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard),
- unsigned int log)
+ struct sock *sk, unsigned int log)
{
struct sock *sk2;
kuid_t uid = sock_i_uid(sk);
@@ -153,13 +160,18 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
(!sk2->sk_reuse || !sk->sk_reuse) &&
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
- (!sk2->sk_reuseport || !sk->sk_reuseport ||
- rcu_access_pointer(sk->sk_reuseport_cb) ||
- !uid_eq(uid, sock_i_uid(sk2))) &&
- saddr_comp(sk, sk2, true)) {
- if (!bitmap)
- return 1;
- __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap);
+ inet_rcv_saddr_equal(sk, sk2, true)) {
+ if (sk2->sk_reuseport && sk->sk_reuseport &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
+ uid_eq(uid, sock_i_uid(sk2))) {
+ if (!bitmap)
+ return 0;
+ } else {
+ if (!bitmap)
+ return 1;
+ __set_bit(udp_sk(sk2)->udp_port_hash >> log,
+ bitmap);
+ }
}
}
return 0;
@@ -171,10 +183,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
*/
static int udp_lib_lport_inuse2(struct net *net, __u16 num,
struct udp_hslot *hslot2,
- struct sock *sk,
- int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard))
+ struct sock *sk)
{
struct sock *sk2;
kuid_t uid = sock_i_uid(sk);
@@ -188,11 +197,14 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
(!sk2->sk_reuse || !sk->sk_reuse) &&
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
- (!sk2->sk_reuseport || !sk->sk_reuseport ||
- rcu_access_pointer(sk->sk_reuseport_cb) ||
- !uid_eq(uid, sock_i_uid(sk2))) &&
- saddr_comp(sk, sk2, true)) {
- res = 1;
+ inet_rcv_saddr_equal(sk, sk2, true)) {
+ if (sk2->sk_reuseport && sk->sk_reuseport &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
+ uid_eq(uid, sock_i_uid(sk2))) {
+ res = 0;
+ } else {
+ res = 1;
+ }
break;
}
}
@@ -200,10 +212,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
return res;
}
-static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
- int (*saddr_same)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard))
+static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
{
struct net *net = sock_net(sk);
kuid_t uid = sock_i_uid(sk);
@@ -217,7 +226,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
(udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
(sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
- (*saddr_same)(sk, sk2, false)) {
+ inet_rcv_saddr_equal(sk, sk2, false)) {
return reuseport_add_sock(sk, sk2);
}
}
@@ -233,14 +242,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
*
* @sk: socket struct in question
* @snum: port number to look up
- * @saddr_comp: AF-dependent comparison of bound local IP addresses
* @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
* with NULL address
*/
int udp_lib_get_port(struct sock *sk, unsigned short snum,
- int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard),
unsigned int hash2_nulladdr)
{
struct udp_hslot *hslot, *hslot2;
@@ -269,7 +274,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
bitmap_zero(bitmap, PORTS_PER_CHAIN);
spin_lock_bh(&hslot->lock);
udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
- saddr_comp, udptable->log);
+ udptable->log);
snum = first;
/*
@@ -285,6 +290,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
snum += rand;
} while (snum != first);
spin_unlock_bh(&hslot->lock);
+ cond_resched();
} while (++first != last);
goto fail;
} else {
@@ -301,12 +307,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
if (hslot->count < hslot2->count)
goto scan_primary_hash;
- exist = udp_lib_lport_inuse2(net, snum, hslot2,
- sk, saddr_comp);
+ exist = udp_lib_lport_inuse2(net, snum, hslot2, sk);
if (!exist && (hash2_nulladdr != slot2)) {
hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
exist = udp_lib_lport_inuse2(net, snum, hslot2,
- sk, saddr_comp);
+ sk);
}
if (exist)
goto fail_unlock;
@@ -314,8 +319,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
goto found;
}
scan_primary_hash:
- if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
- saddr_comp, 0))
+ if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0))
goto fail_unlock;
}
found:
@@ -324,7 +328,7 @@ found:
udp_sk(sk)->udp_portaddr_hash ^= snum;
if (sk_unhashed(sk)) {
if (sk->sk_reuseport &&
- udp_reuseport_add_sock(sk, hslot, saddr_comp)) {
+ udp_reuseport_add_sock(sk, hslot)) {
inet_sk(sk)->inet_num = 0;
udp_sk(sk)->udp_port_hash = 0;
udp_sk(sk)->udp_portaddr_hash ^= snum;
@@ -356,24 +360,6 @@ fail:
}
EXPORT_SYMBOL(udp_lib_get_port);
-/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses
- * match_wildcard == false: addresses must be exactly the same, i.e.
- * 0.0.0.0 only equals to 0.0.0.0
- */
-int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
- bool match_wildcard)
-{
- struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
-
- if (!ipv6_only_sock(sk2)) {
- if (inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)
- return 1;
- if (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr)
- return match_wildcard;
- }
- return 0;
-}
-
static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
unsigned int port)
{
@@ -389,12 +375,13 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
/* precompute partial secondary hash */
udp_sk(sk)->udp_portaddr_hash = hash2_partial;
- return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
+ return udp_lib_get_port(sk, snum, hash2_nulladdr);
}
static int compute_score(struct sock *sk, struct net *net,
__be32 saddr, __be16 sport,
- __be32 daddr, unsigned short hnum, int dif)
+ __be32 daddr, unsigned short hnum, int dif,
+ bool exact_dif)
{
int score;
struct inet_sock *inet;
@@ -425,7 +412,7 @@ static int compute_score(struct sock *sk, struct net *net,
score += 4;
}
- if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if || exact_dif) {
if (sk->sk_bound_dev_if != dif)
return -1;
score += 4;
@@ -450,7 +437,7 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
/* called with rcu_read_lock() */
static struct sock *udp4_lib_lookup2(struct net *net,
__be32 saddr, __be16 sport,
- __be32 daddr, unsigned int hnum, int dif,
+ __be32 daddr, unsigned int hnum, int dif, bool exact_dif,
struct udp_hslot *hslot2,
struct sk_buff *skb)
{
@@ -462,7 +449,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
badness = 0;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif);
+ daddr, hnum, dif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -497,6 +484,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
unsigned short hnum = ntohs(dport);
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
+ bool exact_dif = udp_lib_exact_dif_match(net, skb);
int score, badness, matches = 0, reuseport = 0;
u32 hash = 0;
@@ -509,7 +497,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
- hslot2, skb);
+ exact_dif, hslot2, skb);
if (!result) {
unsigned int old_slot2 = slot2;
hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
@@ -524,7 +512,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
- hslot2, skb);
+ exact_dif, hslot2, skb);
}
return result;
}
@@ -533,7 +521,7 @@ begin:
badness = 0;
sk_for_each_rcu(sk, &hslot->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif);
+ daddr, hnum, dif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -580,7 +568,8 @@ EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb);
* Does increment socket refcount.
*/
#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
- IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY)
+ IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \
+ IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif)
{
@@ -1019,7 +1008,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
flow_flags,
- faddr, saddr, dport, inet->inet_sport);
+ faddr, saddr, dport, inet->inet_sport,
+ sk->sk_uid);
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
@@ -1111,7 +1101,8 @@ out:
return err;
do_confirm:
- dst_confirm(&rt->dst);
+ if (msg->msg_flags & MSG_PROBE)
+ dst_confirm_neigh(&rt->dst, &fl4->daddr);
if (!(msg->msg_flags&MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
@@ -1172,6 +1163,181 @@ out:
return ret;
}
+/* fully reclaim rmem/fwd memory allocated for skb */
+static void udp_rmem_release(struct sock *sk, int size, int partial)
+{
+ struct udp_sock *up = udp_sk(sk);
+ int amt;
+
+ if (likely(partial)) {
+ up->forward_deficit += size;
+ size = up->forward_deficit;
+ if (size < (sk->sk_rcvbuf >> 2) &&
+ !skb_queue_empty(&sk->sk_receive_queue))
+ return;
+ } else {
+ size += up->forward_deficit;
+ }
+ up->forward_deficit = 0;
+
+ sk->sk_forward_alloc += size;
+ amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
+ sk->sk_forward_alloc -= amt;
+
+ if (amt)
+ __sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
+
+ atomic_sub(size, &sk->sk_rmem_alloc);
+}
+
+/* Note: called with sk_receive_queue.lock held.
+ * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch
+ * This avoids a cache line miss while receive_queue lock is held.
+ * Look at __udp_enqueue_schedule_skb() to find where this copy is done.
+ */
+void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
+{
+ udp_rmem_release(sk, skb->dev_scratch, 1);
+}
+EXPORT_SYMBOL(udp_skb_destructor);
+
+/* Idea of busylocks is to let producers grab an extra spinlock
+ * to relieve pressure on the receive_queue spinlock shared by consumer.
+ * Under flood, this means that only one producer can be in line
+ * trying to acquire the receive_queue spinlock.
+ * These busylock can be allocated on a per cpu manner, instead of a
+ * per socket one (that would consume a cache line per socket)
+ */
+static int udp_busylocks_log __read_mostly;
+static spinlock_t *udp_busylocks __read_mostly;
+
+static spinlock_t *busylock_acquire(void *ptr)
+{
+ spinlock_t *busy;
+
+ busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log);
+ spin_lock(busy);
+ return busy;
+}
+
+static void busylock_release(spinlock_t *busy)
+{
+ if (busy)
+ spin_unlock(busy);
+}
+
+int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff_head *list = &sk->sk_receive_queue;
+ int rmem, delta, amt, err = -ENOMEM;
+ spinlock_t *busy = NULL;
+ int size;
+
+ /* try to avoid the costly atomic add/sub pair when the receive
+ * queue is full; always allow at least a packet
+ */
+ rmem = atomic_read(&sk->sk_rmem_alloc);
+ if (rmem > sk->sk_rcvbuf)
+ goto drop;
+
+ /* Under mem pressure, it might be helpful to help udp_recvmsg()
+ * having linear skbs :
+ * - Reduce memory overhead and thus increase receive queue capacity
+ * - Less cache line misses at copyout() time
+ * - Less work at consume_skb() (less alien page frag freeing)
+ */
+ if (rmem > (sk->sk_rcvbuf >> 1)) {
+ skb_condense(skb);
+
+ busy = busylock_acquire(sk);
+ }
+ size = skb->truesize;
+ /* Copy skb->truesize into skb->dev_scratch to avoid a cache line miss
+ * in udp_skb_destructor()
+ */
+ skb->dev_scratch = size;
+
+ /* we drop only if the receive buf is full and the receive
+ * queue contains some other skb
+ */
+ rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
+ if (rmem > (size + sk->sk_rcvbuf))
+ goto uncharge_drop;
+
+ spin_lock(&list->lock);
+ if (size >= sk->sk_forward_alloc) {
+ amt = sk_mem_pages(size);
+ delta = amt << SK_MEM_QUANTUM_SHIFT;
+ if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) {
+ err = -ENOBUFS;
+ spin_unlock(&list->lock);
+ goto uncharge_drop;
+ }
+
+ sk->sk_forward_alloc += delta;
+ }
+
+ sk->sk_forward_alloc -= size;
+
+ /* no need to setup a destructor, we will explicitly release the
+ * forward allocated memory on dequeue
+ */
+ sock_skb_set_dropcount(sk, skb);
+
+ __skb_queue_tail(list, skb);
+ spin_unlock(&list->lock);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+
+ busylock_release(busy);
+ return 0;
+
+uncharge_drop:
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+
+drop:
+ atomic_inc(&sk->sk_drops);
+ busylock_release(busy);
+ return err;
+}
+EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
+
+void udp_destruct_sock(struct sock *sk)
+{
+ /* reclaim completely the forward allocated memory */
+ unsigned int total = 0;
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ total += skb->truesize;
+ kfree_skb(skb);
+ }
+ udp_rmem_release(sk, total, 0);
+
+ inet_sock_destruct(sk);
+}
+EXPORT_SYMBOL_GPL(udp_destruct_sock);
+
+int udp_init_sock(struct sock *sk)
+{
+ sk->sk_destruct = udp_destruct_sock;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(udp_init_sock);
+
+void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
+{
+ if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) {
+ bool slow = lock_sock_fast(sk);
+
+ sk_peek_offset_bwd(sk, len);
+ unlock_sock_fast(sk, slow);
+ }
+ consume_skb(skb);
+}
+EXPORT_SYMBOL_GPL(skb_consume_udp);
+
/**
* first_packet_length - return length of first packet in receive queue
* @sk: socket
@@ -1181,12 +1347,11 @@ out:
*/
static int first_packet_length(struct sock *sk)
{
- struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue;
+ struct sk_buff_head *rcvq = &sk->sk_receive_queue;
struct sk_buff *skb;
+ int total = 0;
int res;
- __skb_queue_head_init(&list_kill);
-
spin_lock_bh(&rcvq->lock);
while ((skb = skb_peek(rcvq)) != NULL &&
udp_lib_checksum_complete(skb)) {
@@ -1196,18 +1361,13 @@ static int first_packet_length(struct sock *sk)
IS_UDPLITE(sk));
atomic_inc(&sk->sk_drops);
__skb_unlink(skb, rcvq);
- __skb_queue_tail(&list_kill, skb);
+ total += skb->truesize;
+ kfree_skb(skb);
}
res = skb ? skb->len : -1;
+ if (total)
+ udp_rmem_release(sk, total, 1);
spin_unlock_bh(&rcvq->lock);
-
- if (!skb_queue_empty(&list_kill)) {
- bool slow = lock_sock_fast(sk);
-
- __skb_queue_purge(&list_kill);
- sk_mem_reclaim_partial(sk);
- unlock_sock_fast(sk, slow);
- }
return res;
}
@@ -1256,15 +1416,13 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int err;
int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false;
- bool slow;
if (flags & MSG_ERRQUEUE)
return ip_recv_error(sk, msg, len, addr_len);
try_again:
peeking = off = sk_peek_offset(sk, flags);
- skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
- &peeked, &off, &err);
+ skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
if (!skb)
return err;
@@ -1281,7 +1439,8 @@ try_again:
* coverage checksum (UDP-Lite), do it before the copy.
*/
- if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
+ if (copied < ulen || peeking ||
+ (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
checksum_valid = !udp_lib_checksum_complete(skb);
if (!checksum_valid)
goto csum_copy_err;
@@ -1297,13 +1456,12 @@ try_again:
}
if (unlikely(err)) {
- trace_kfree_skb(skb, udp_recvmsg);
if (!peeked) {
atomic_inc(&sk->sk_drops);
UDP_INC_STATS(sock_net(sk),
UDP_MIB_INERRORS, is_udplite);
}
- skb_free_datagram_locked(sk, skb);
+ kfree_skb(skb);
return err;
}
@@ -1322,22 +1480,21 @@ try_again:
*addr_len = sizeof(*sin);
}
if (inet->cmsg_flags)
- ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr), off);
+ ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
err = copied;
if (flags & MSG_TRUNC)
err = ulen;
- __skb_free_datagram_locked(sk, skb, peeking ? -err : err);
+ skb_consume_udp(sk, skb, peeking ? -err : err);
return err;
csum_copy_err:
- slow = lock_sock_fast(sk);
- if (!skb_kill_datagram(sk, skb, flags)) {
+ if (!__sk_queue_drop_skb(sk, skb, flags, udp_skb_destructor)) {
UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
}
- unlock_sock_fast(sk, slow);
+ kfree_skb(skb);
/* starting over for a new packet, but check if we need to yield */
cond_resched();
@@ -1463,9 +1620,11 @@ int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
sk_incoming_cpu_update(sk);
+ } else {
+ sk_mark_napi_id_once(sk, skb);
}
- rc = __sock_queue_rcv_skb(sk, skb);
+ rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
@@ -1480,7 +1639,6 @@ int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
return 0;
-
}
static struct static_key udp_encap_needed __read_mostly;
@@ -1502,7 +1660,6 @@ EXPORT_SYMBOL(udp_encap_enable);
int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
- int rc;
int is_udplite = IS_UDPLITE(sk);
/*
@@ -1589,25 +1746,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto drop;
udp_csum_pull_header(skb);
- if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
- __UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
- is_udplite);
- goto drop;
- }
-
- rc = 0;
ipv4_pktinfo_prepare(sk, skb);
- bh_lock_sock(sk);
- if (!sock_owned_by_user(sk))
- rc = __udp_queue_rcv_skb(sk, skb);
- else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
- bh_unlock_sock(sk);
- goto drop;
- }
- bh_unlock_sock(sk);
-
- return rc;
+ return __udp_queue_rcv_skb(sk, skb);
csum_error:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
@@ -2217,13 +2358,13 @@ struct proto udp_prot = {
.connect = ip4_datagram_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
+ .init = udp_init_sock,
.destroy = udp_destroy_sock,
.setsockopt = udp_setsockopt,
.getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
- .backlog_rcv = __udp_queue_rcv_skb,
.release_cb = ip4_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
@@ -2512,6 +2653,7 @@ EXPORT_SYMBOL(udp_flow_hashrnd);
void __init udp_init(void)
{
unsigned long limit;
+ unsigned int i;
udp_table_init(&udp_table, "UDP");
limit = nr_free_buffer_pages() / 8;
@@ -2522,4 +2664,13 @@ void __init udp_init(void)
sysctl_udp_rmem_min = SK_MEM_QUANTUM;
sysctl_udp_wmem_min = SK_MEM_QUANTUM;
+
+ /* 16 spinlocks per cpu */
+ udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
+ udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log,
+ GFP_KERNEL);
+ if (!udp_busylocks)
+ panic("UDP: failed to alloc udp_busylocks\n");
+ for (i = 0; i < (1U << udp_busylocks_log); i++)
+ spin_lock_init(udp_busylocks + i);
}
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index ff450c2aad9b..59f10fe9782e 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -50,10 +50,11 @@ struct proto udplite_prot = {
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
- .backlog_rcv = __udp_queue_rcv_skb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.get_port = udp_v4_get_port,
+ .memory_allocated = &udp_memory_allocated,
+ .sysctl_mem = sysctl_udp_mem,
.obj_size = sizeof(struct udp_sock),
.h.udp_table = &udplite_table,
#ifdef CONFIG_COMPAT
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 62e1e72db461..1fc684111ce6 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -40,6 +40,7 @@ drop:
int xfrm4_transport_finish(struct sk_buff *skb, int async)
{
+ struct xfrm_offload *xo = xfrm_offload(skb);
struct iphdr *iph = ip_hdr(skb);
iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
@@ -53,6 +54,11 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
iph->tot_len = htons(skb->len);
ip_send_check(iph);
+ if (xo && (xo->flags & XFRM_GRO)) {
+ skb_mac_header_rebuild(skb);
+ return 0;
+ }
+
NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
dev_net(skb->dev), NULL, skb, skb->dev, NULL,
xfrm4_rcv_encap_finish);
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index fd840c7d75ea..4acc0508c5eb 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -43,6 +43,7 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
{
int ihl = skb->data - skb_transport_header(skb);
+ struct xfrm_offload *xo = xfrm_offload(skb);
if (skb->transport_header != skb->network_header) {
memmove(skb_transport_header(skb),
@@ -50,7 +51,8 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
skb->network_header = skb->transport_header;
}
ip_hdr(skb)->tot_len = htons(skb->len + ihl);
- skb_reset_transport_header(skb);
+ if (!xo || !(xo->flags & XFRM_GRO))
+ skb_reset_transport_header(skb);
return 0;
}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 6a7ff6957535..71b4ecc195c7 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -17,8 +17,6 @@
#include <net/ip.h>
#include <net/l3mdev.h>
-static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
-
static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
int tos, int oif,
const xfrm_address_t *saddr,
@@ -219,7 +217,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
{
struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
- xfrm4_policy_afinfo.garbage_collect(net);
+ xfrm_garbage_collect_deferred(net);
return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
}
@@ -271,8 +269,7 @@ static struct dst_ops xfrm4_dst_ops_template = {
.gc_thresh = INT_MAX,
};
-static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
- .family = AF_INET,
+static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
.dst_ops = &xfrm4_dst_ops_template,
.dst_lookup = xfrm4_dst_lookup,
.get_saddr = xfrm4_get_saddr,
@@ -376,7 +373,7 @@ static struct pernet_operations __net_initdata xfrm4_net_ops = {
static void __init xfrm4_policy_init(void)
{
- xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
+ xfrm_policy_register_afinfo(&xfrm4_policy_afinfo, AF_INET);
}
void __init xfrm4_init(void)
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index dccefa9d84cf..8dd0e6ab8606 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -188,9 +188,8 @@ static const struct net_protocol ipcomp4_protocol = {
.netns_ok = 1,
};
-static struct xfrm_input_afinfo xfrm4_input_afinfo = {
+static const struct xfrm_input_afinfo xfrm4_input_afinfo = {
.family = AF_INET,
- .owner = THIS_MODULE,
.callback = xfrm4_rcv_cb,
};
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 542074c00c78..d6660a8c0ea5 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -90,11 +90,3 @@ void __init xfrm4_state_init(void)
{
xfrm_state_register_afinfo(&xfrm4_state_afinfo);
}
-
-#if 0
-void __exit xfrm4_state_fini(void)
-{
- xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
-}
-#endif /* 0 */
-
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 2343e4f2e0bf..e2afe677a9d9 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -75,6 +75,19 @@ config INET6_ESP
If unsure, say Y.
+config INET6_ESP_OFFLOAD
+ tristate "IPv6: ESP transformation offload"
+ depends on INET6_ESP
+ select XFRM_OFFLOAD
+ default n
+ ---help---
+ Support for ESP transformation offload. This makes sense
+ only if this system really does IPsec and want to do it
+ with high throughput. A typical desktop system does not
+ need it, even if it does IPsec.
+
+ If unsure, say N.
+
config INET6_IPCOMP
tristate "IPv6: IPComp transformation"
select INET6_XFRM_TUNNEL
@@ -208,6 +221,7 @@ config IPV6_TUNNEL
tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)"
select INET6_TUNNEL
select DST_CACHE
+ select GRO_CELLS
---help---
Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in
RFC 2473.
@@ -289,4 +303,39 @@ config IPV6_PIMSM_V2
Support for IPv6 PIM multicast routing protocol PIM-SMv2.
If unsure, say N.
+config IPV6_SEG6_LWTUNNEL
+ bool "IPv6: Segment Routing Header encapsulation support"
+ depends on IPV6
+ select LWTUNNEL
+ ---help---
+ Support for encapsulation of packets within an outer IPv6
+ header and a Segment Routing Header using the lightweight
+ tunnels mechanism.
+
+ If unsure, say N.
+
+config IPV6_SEG6_INLINE
+ bool "IPv6: direct Segment Routing Header insertion "
+ depends on IPV6_SEG6_LWTUNNEL
+ ---help---
+ Support for direct insertion of the Segment Routing Header,
+ also known as inline mode. Be aware that direct insertion of
+ extension headers (as opposed to encapsulation) may break
+ multiple mechanisms such as PMTUD or IPSec AH. Use this feature
+ only if you know exactly what you are doing.
+
+ If unsure, say N.
+
+config IPV6_SEG6_HMAC
+ bool "IPv6: Segment Routing HMAC support"
+ depends on IPV6
+ select CRYPTO_HMAC
+ select CRYPTO_SHA1
+ select CRYPTO_SHA256
+ ---help---
+ Support for HMAC signature generation and verification
+ of SR-enabled packets.
+
+ If unsure, say N.
+
endif # IPV6
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index c174ccb340a1..217e9ff0e24b 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -9,7 +9,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
- udp_offload.o
+ udp_offload.o seg6.o
ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o
@@ -23,11 +23,14 @@ ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o
ipv6-$(CONFIG_PROC_FS) += proc.o
ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
ipv6-$(CONFIG_NETLABEL) += calipso.o
+ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o
+ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o
ipv6-objs += $(ipv6-y)
obj-$(CONFIG_INET6_AH) += ah6.o
obj-$(CONFIG_INET6_ESP) += esp6.o
+obj-$(CONFIG_INET6_ESP_OFFLOAD) += esp6_offload.o
obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o
obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4bc5ba3ae452..80ce478c4851 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -43,6 +43,7 @@
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/kernel.h>
+#include <linux/sched/signal.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
@@ -238,6 +239,12 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.use_oif_addrs_only = 0,
.ignore_routes_with_linkdown = 0,
.keep_addr_on_down = 0,
+ .seg6_enabled = 0,
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ .seg6_require_hmac = 0,
+#endif
+ .enhanced_dad = 1,
+ .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64,
};
static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -284,6 +291,12 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.use_oif_addrs_only = 0,
.ignore_routes_with_linkdown = 0,
.keep_addr_on_down = 0,
+ .seg6_enabled = 0,
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ .seg6_require_hmac = 0,
+#endif
+ .enhanced_dad = 1,
+ .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64,
};
/* Check if a valid qdisc is available */
@@ -376,9 +389,9 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));
if (ndev->cnf.stable_secret.initialized)
- ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
+ ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
else
- ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64;
+ ndev->cnf.addr_gen_mode = ipv6_devconf_dflt.addr_gen_mode;
ndev->cnf.mtu6 = dev->mtu;
ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
@@ -2134,12 +2147,14 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
case ARPHRD_SIT:
return addrconf_ifid_sit(eui, dev);
case ARPHRD_IPGRE:
+ case ARPHRD_TUNNEL:
return addrconf_ifid_gre(eui, dev);
case ARPHRD_6LOWPAN:
return addrconf_ifid_eui64(eui, dev);
case ARPHRD_IEEE1394:
return addrconf_ifid_ieee1394(eui, dev);
case ARPHRD_TUNNEL6:
+ case ARPHRD_IP6GRE:
return addrconf_ifid_ip6tnl(eui, dev);
}
return -1;
@@ -2377,8 +2392,8 @@ static void manage_tempaddrs(struct inet6_dev *idev,
static bool is_addr_mode_generate_stable(struct inet6_dev *idev)
{
- return idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY ||
- idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM;
+ return idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY ||
+ idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM;
}
int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
@@ -3142,7 +3157,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
- switch (idev->addr_gen_mode) {
+ switch (idev->cnf.addr_gen_mode) {
case IN6_ADDR_GEN_MODE_RANDOM:
ipv6_gen_mode_random_init(idev);
/* fallthrough */
@@ -3183,6 +3198,9 @@ static void addrconf_dev_config(struct net_device *dev)
(dev->type != ARPHRD_IEEE1394) &&
(dev->type != ARPHRD_TUNNEL6) &&
(dev->type != ARPHRD_6LOWPAN) &&
+ (dev->type != ARPHRD_IP6GRE) &&
+ (dev->type != ARPHRD_IPGRE) &&
+ (dev->type != ARPHRD_TUNNEL) &&
(dev->type != ARPHRD_NONE)) {
/* Alas, we support only Ethernet autoconfiguration. */
return;
@@ -3194,8 +3212,8 @@ static void addrconf_dev_config(struct net_device *dev)
/* this device type has no EUI support */
if (dev->type == ARPHRD_NONE &&
- idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)
- idev->addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM;
+ idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)
+ idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM;
addrconf_addr_gen(idev, false);
}
@@ -3376,9 +3394,15 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
}
if (idev) {
- if (idev->if_flags & IF_READY)
- /* device is already configured. */
+ if (idev->if_flags & IF_READY) {
+ /* device is already configured -
+ * but resend MLD reports, we might
+ * have roamed and need to update
+ * multicast snooping switches
+ */
+ ipv6_mc_up(idev);
break;
+ }
idev->if_flags |= IF_READY;
}
@@ -3602,14 +3626,19 @@ restart:
INIT_LIST_HEAD(&del_list);
list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
struct rt6_info *rt = NULL;
+ bool keep;
addrconf_del_dad_work(ifa);
+ keep = keep_addr && (ifa->flags & IFA_F_PERMANENT) &&
+ !addr_is_local(&ifa->addr);
+ if (!keep)
+ list_move(&ifa->if_list, &del_list);
+
write_unlock_bh(&idev->lock);
spin_lock_bh(&ifa->lock);
- if (keep_addr && (ifa->flags & IFA_F_PERMANENT) &&
- !addr_is_local(&ifa->addr)) {
+ if (keep) {
/* set state to skip the notifier below */
state = INET6_IFADDR_STATE_DEAD;
ifa->state = 0;
@@ -3621,8 +3650,6 @@ restart:
} else {
state = ifa->state;
ifa->state = INET6_IFADDR_STATE_DEAD;
-
- list_move(&ifa->if_list, &del_list);
}
spin_unlock_bh(&ifa->lock);
@@ -3727,12 +3754,21 @@ static void addrconf_dad_kick(struct inet6_ifaddr *ifp)
{
unsigned long rand_num;
struct inet6_dev *idev = ifp->idev;
+ u64 nonce;
if (ifp->flags & IFA_F_OPTIMISTIC)
rand_num = 0;
else
rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1);
+ nonce = 0;
+ if (idev->cnf.enhanced_dad ||
+ dev_net(idev->dev)->ipv6.devconf_all->enhanced_dad) {
+ do
+ get_random_bytes(&nonce, 6);
+ while (nonce == 0);
+ }
+ ifp->dad_nonce = nonce;
ifp->dad_probes = idev->cnf.dad_transmits;
addrconf_mod_dad_work(ifp, rand_num);
}
@@ -3910,7 +3946,8 @@ static void addrconf_dad_work(struct work_struct *w)
/* send a neighbour solicitation for our addr */
addrconf_addr_solict_mult(&ifp->addr, &mcaddr);
- ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any);
+ ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any,
+ ifp->dad_nonce);
out:
in6_ifa_put(ifp);
rtnl_unlock();
@@ -3989,6 +4026,12 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id)
if (bump_id)
rt_genid_bump_ipv6(dev_net(dev));
+
+ /* Make sure that a new temporary address will be created
+ * before this temporary address becomes deprecated.
+ */
+ if (ifp->flags & IFA_F_TEMPORARY)
+ addrconf_verify_rtnl();
}
static void addrconf_dad_run(struct inet6_dev *idev)
@@ -4868,6 +4911,13 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
struct net *net = dev_net(ifa->idev->dev);
int err = -ENOBUFS;
+ /* Don't send DELADDR notification for TENTATIVE address,
+ * since NEWADDR notification is sent only after removing
+ * TENTATIVE flag.
+ */
+ if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR)
+ return;
+
skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
if (!skb)
goto errout;
@@ -4950,6 +5000,12 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down;
+ array[DEVCONF_SEG6_ENABLED] = cnf->seg6_enabled;
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac;
+#endif
+ array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad;
+ array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode;
}
static inline size_t inet6_ifla6_size(void)
@@ -5061,7 +5117,7 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
if (!nla)
goto nla_put_failure;
- if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->addr_gen_mode))
+ if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode))
goto nla_put_failure;
read_lock_bh(&idev->lock);
@@ -5179,6 +5235,26 @@ static int inet6_validate_link_af(const struct net_device *dev,
return nla_parse_nested(tb, IFLA_INET6_MAX, nla, inet6_af_policy);
}
+static int check_addr_gen_mode(int mode)
+{
+ if (mode != IN6_ADDR_GEN_MODE_EUI64 &&
+ mode != IN6_ADDR_GEN_MODE_NONE &&
+ mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
+ mode != IN6_ADDR_GEN_MODE_RANDOM)
+ return -EINVAL;
+ return 1;
+}
+
+static int check_stable_privacy(struct inet6_dev *idev, struct net *net,
+ int mode)
+{
+ if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
+ !idev->cnf.stable_secret.initialized &&
+ !net->ipv6.devconf_dflt->stable_secret.initialized)
+ return -EINVAL;
+ return 1;
+}
+
static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
{
int err = -EINVAL;
@@ -5200,18 +5276,11 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
if (tb[IFLA_INET6_ADDR_GEN_MODE]) {
u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]);
- if (mode != IN6_ADDR_GEN_MODE_EUI64 &&
- mode != IN6_ADDR_GEN_MODE_NONE &&
- mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
- mode != IN6_ADDR_GEN_MODE_RANDOM)
- return -EINVAL;
-
- if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
- !idev->cnf.stable_secret.initialized &&
- !dev_net(dev)->ipv6.devconf_dflt->stable_secret.initialized)
+ if (check_addr_gen_mode(mode) < 0 ||
+ check_stable_privacy(idev, dev_net(dev), mode) < 0)
return -EINVAL;
- idev->addr_gen_mode = mode;
+ idev->cnf.addr_gen_mode = mode;
err = 0;
}
@@ -5515,8 +5584,7 @@ static void addrconf_disable_change(struct net *net, __s32 newf)
struct net_device *dev;
struct inet6_dev *idev;
- rcu_read_lock();
- for_each_netdev_rcu(net, dev) {
+ for_each_netdev(net, dev) {
idev = __in6_dev_get(dev);
if (idev) {
int changed = (!idev->cnf.disable_ipv6) ^ (!newf);
@@ -5525,7 +5593,6 @@ static void addrconf_disable_change(struct net *net, __s32 newf)
dev_disable_change(idev);
}
}
- rcu_read_unlock();
}
static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf)
@@ -5620,6 +5687,55 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write,
return ret;
}
+static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = 0;
+ int new_val;
+ struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1;
+ struct net *net = (struct net *)ctl->extra2;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+ if (write) {
+ new_val = *((int *)ctl->data);
+
+ if (check_addr_gen_mode(new_val) < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* request for default */
+ if (&net->ipv6.devconf_dflt->addr_gen_mode == ctl->data) {
+ ipv6_devconf_dflt.addr_gen_mode = new_val;
+
+ /* request for individual net device */
+ } else {
+ if (!idev)
+ goto out;
+
+ if (check_stable_privacy(idev, net, new_val) < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (idev->cnf.addr_gen_mode != new_val) {
+ idev->cnf.addr_gen_mode = new_val;
+ addrconf_dev_config(idev->dev);
+ }
+ }
+ }
+
+out:
+ rtnl_unlock();
+
+ return ret;
+}
+
static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -5670,14 +5786,14 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
struct inet6_dev *idev = __in6_dev_get(dev);
if (idev) {
- idev->addr_gen_mode =
+ idev->cnf.addr_gen_mode =
IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
}
}
} else {
struct inet6_dev *idev = ctl->extra1;
- idev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
+ idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
}
out:
@@ -6042,6 +6158,36 @@ static const struct ctl_table addrconf_sysctl[] = {
},
{
+ .procname = "seg6_enabled",
+ .data = &ipv6_devconf.seg6_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ {
+ .procname = "seg6_require_hmac",
+ .data = &ipv6_devconf.seg6_require_hmac,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "enhanced_dad",
+ .data = &ipv6_devconf.enhanced_dad,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "addr_gen_mode",
+ .data = &ipv6_devconf.addr_gen_mode,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_addr_gen_mode,
+ },
+ {
/* sentinel */
}
};
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 46ad699937fd..a9a9553ee63d 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -61,8 +61,9 @@
#include <net/ip6_tunnel.h>
#endif
#include <net/calipso.h>
+#include <net/seg6.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/mroute6.h>
#include "ip6_offload.h"
@@ -257,6 +258,14 @@ lookup_protocol:
goto out;
}
}
+
+ if (!kern) {
+ err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+ if (err) {
+ sk_common_release(sk);
+ goto out;
+ }
+ }
out:
return err;
out_rcu_unlock:
@@ -293,7 +302,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
return -EINVAL;
snum = ntohs(addr->sin6_port);
- if (snum && snum < PROT_SOCK && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
+ if (snum && snum < inet_prot_sock(net) &&
+ !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
return -EACCES;
lock_sock(sk);
@@ -678,6 +688,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_dport = inet->inet_dport;
fl6.fl6_sport = inet->inet_sport;
+ fl6.flowi6_uid = sk->sk_uid;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
rcu_read_lock();
@@ -909,12 +920,12 @@ static int __init inet6_init(void)
err = register_pernet_subsys(&inet6_net_ops);
if (err)
goto register_pernet_fail;
- err = icmpv6_init();
- if (err)
- goto icmp_fail;
err = ip6_mr_init();
if (err)
goto ipmr_fail;
+ err = icmpv6_init();
+ if (err)
+ goto icmp_fail;
err = ndisc_init();
if (err)
goto ndisc_fail;
@@ -990,6 +1001,10 @@ static int __init inet6_init(void)
if (err)
goto calipso_fail;
+ err = seg6_init();
+ if (err)
+ goto seg6_fail;
+
#ifdef CONFIG_SYSCTL
err = ipv6_sysctl_register();
if (err)
@@ -1000,8 +1015,10 @@ out:
#ifdef CONFIG_SYSCTL
sysctl_fail:
- calipso_exit();
+ seg6_exit();
#endif
+seg6_fail:
+ calipso_exit();
calipso_fail:
pingv6_exit();
pingv6_fail:
@@ -1044,10 +1061,10 @@ igmp_fail:
ndisc_cleanup();
ndisc_fail:
ip6_mr_cleanup();
-ipmr_fail:
- icmpv6_cleanup();
icmp_fail:
unregister_pernet_subsys(&inet6_net_ops);
+ipmr_fail:
+ icmpv6_cleanup();
register_pernet_fail:
sock_unregister(PF_INET6);
rtnl_unregister_all(PF_INET6);
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 0630a4d5daaa..dda6035e3b84 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -474,6 +474,9 @@ static void ah6_input_done(struct crypto_async_request *base, int err)
int hdr_len = skb_network_header_len(skb);
int ah_hlen = (ah->hdrlen + 2) << 2;
+ if (err)
+ goto out;
+
work_iph = AH_SKB_CB(skb)->tmp;
auth_data = ah_tmp_auth(work_iph, hdr_len);
icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
@@ -662,9 +665,10 @@ static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index ccf40550c475..e011122ebd43 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -33,7 +33,7 @@
#include <net/dsfield.h>
#include <linux/errqueue.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
static bool ipv6_mapped_addr_any(const struct in6_addr *a)
{
@@ -54,6 +54,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk)
fl6->fl6_dport = inet->inet_dport;
fl6->fl6_sport = inet->inet_sport;
fl6->flowlabel = np->flow_label;
+ fl6->flowi6_uid = sk->sk_uid;
if (!fl6->flowi6_oif)
fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
@@ -166,18 +167,22 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
if (np->sndflow)
fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
- addr_type = ipv6_addr_type(&usin->sin6_addr);
-
- if (addr_type == IPV6_ADDR_ANY) {
+ if (ipv6_addr_any(&usin->sin6_addr)) {
/*
* connect to self
*/
- usin->sin6_addr.s6_addr[15] = 0x01;
+ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+ ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
+ &usin->sin6_addr);
+ else
+ usin->sin6_addr = in6addr_loopback;
}
+ addr_type = ipv6_addr_type(&usin->sin6_addr);
+
daddr = &usin->sin6_addr;
- if (addr_type == IPV6_ADDR_MAPPED) {
+ if (addr_type & IPV6_ADDR_MAPPED) {
struct sockaddr_in sin;
if (__ipv6_only_sock(sk)) {
@@ -400,9 +405,6 @@ static inline bool ipv6_datagram_support_addr(struct sock_exterr_skb *serr)
* At one point, excluding local errors was a quick test to identify icmp/icmp6
* errors. This is no longer true, but the test remained, so the v6 stack,
* unlike v4, also honors cmsg requests on all wifi and timestamp errors.
- *
- * Timestamp code paths do not initialize the fields expected by cmsg:
- * the PKTINFO fields in skb->cb[]. Fill those in here.
*/
static bool ip6_datagram_support_cmsg(struct sk_buff *skb,
struct sock_exterr_skb *serr)
@@ -414,14 +416,9 @@ static bool ip6_datagram_support_cmsg(struct sk_buff *skb,
if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL)
return false;
- if (!skb->dev)
+ if (!IP6CB(skb)->iif)
return false;
- if (skb->protocol == htons(ETH_P_IPV6))
- IP6CB(skb)->iif = skb->dev->ifindex;
- else
- PKTINFO_SKB_CB(skb)->ipi_ifindex = skb->dev->ifindex;
-
return true;
}
@@ -700,7 +697,7 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
struct sockaddr_in6 sin6;
__be16 *ports = (__be16 *) skb_transport_header(skb);
- if (skb_transport_offset(skb) + 4 <= skb->len) {
+ if (skb_transport_offset(skb) + 4 <= (int)skb->len) {
/* All current transport protocols have the port numbers in the
* first four bytes of the transport header and this function is
* written with this assumption in mind.
@@ -717,6 +714,11 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6);
}
}
+ if (np->rxopt.bits.recvfragsize && opt->frag_max_size) {
+ int val = opt->frag_max_size;
+
+ put_cmsg(msg, SOL_IPV6, IPV6_RECVFRAGSIZE, sizeof(val), &val);
+ }
}
void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 111ba55fd512..ff54faa75631 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -44,6 +44,8 @@
#include <net/protocol.h>
#include <linux/icmpv6.h>
+#include <linux/highmem.h>
+
struct esp_skb_cb {
struct xfrm_skb_cb xfrm;
void *tmp;
@@ -114,11 +116,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
__alignof__(struct scatterlist));
}
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
+{
+ __be32 *seqhi;
+ struct crypto_aead *aead = x->data;
+ int seqhilen = 0;
+ u8 *iv;
+ struct aead_request *req;
+ struct scatterlist *sg;
+
+ if (x->props.flags & XFRM_STATE_ESN)
+ seqhilen += sizeof(__be32);
+
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
+ req = esp_tmp_req(aead, iv);
+
+ /* Unref skb_frag_pages in the src scatterlist if necessary.
+ * Skip the first sg which comes from skb->data.
+ */
+ if (req->src != req->dst)
+ for (sg = sg_next(req->src); sg; sg = sg_next(sg))
+ put_page(sg_page(sg));
+}
+
static void esp_output_done(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
+ void *tmp;
+ struct dst_entry *dst = skb_dst(skb);
+ struct xfrm_state *x = dst->xfrm;
- kfree(ESP_SKB_CB(skb)->tmp);
+ tmp = ESP_SKB_CB(skb)->tmp;
+ esp_ssg_unref(x, tmp);
+ kfree(tmp);
xfrm_output_resume(skb, err);
}
@@ -138,6 +169,27 @@ static void esp_output_restore_header(struct sk_buff *skb)
esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32));
}
+static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb,
+ struct ip_esp_hdr *esph,
+ __be32 *seqhi)
+{
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+
+ /* For ESN we move the header forward by 4 bytes to
+ * accomodate the high bits. We will move it back after
+ * encryption.
+ */
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
+ *seqhi = esph->spi;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ }
+
+ esph->spi = x->id.spi;
+
+ return esph;
+}
+
static void esp_output_done_esn(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
@@ -146,14 +198,31 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
esp_output_done(base, err);
}
+static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
+{
+ /* Fill padding... */
+ if (tfclen) {
+ memset(tail, 0, tfclen);
+ tail += tfclen;
+ }
+ do {
+ int i;
+ for (i = 0; i < plen - 2; i++)
+ tail[i] = i + 1;
+ } while (0);
+ tail[plen - 2] = plen - 2;
+ tail[plen - 1] = proto;
+}
+
static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
{
int err;
struct ip_esp_hdr *esph;
struct crypto_aead *aead;
struct aead_request *req;
- struct scatterlist *sg;
+ struct scatterlist *sg, *dsg;
struct sk_buff *trailer;
+ struct page *page;
void *tmp;
int blksize;
int clen;
@@ -164,10 +233,13 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
int nfrags;
int assoclen;
int seqhilen;
+ int tailen;
u8 *iv;
u8 *tail;
+ u8 *vaddr;
__be32 *seqhi;
__be64 seqno;
+ __u8 proto = *skb_mac_header(skb);
/* skb is pure payload to encrypt */
aead = x->data;
@@ -186,11 +258,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
blksize = ALIGN(crypto_aead_blocksize(aead), 4);
clen = ALIGN(skb->len + 2 + tfclen, blksize);
plen = clen - skb->len - tfclen;
-
- err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
- if (err < 0)
- goto error;
- nfrags = err;
+ tailen = tfclen + plen + alen;
assoclen = sizeof(*esph);
seqhilen = 0;
@@ -200,59 +268,152 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
assoclen += seqhilen;
}
- tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
- if (!tmp) {
- err = -ENOMEM;
- goto error;
+ *skb_mac_header(skb) = IPPROTO_ESP;
+ esph = ip_esp_hdr(skb);
+
+ if (!skb_cloned(skb)) {
+ if (tailen <= skb_availroom(skb)) {
+ nfrags = 1;
+ trailer = skb;
+ tail = skb_tail_pointer(trailer);
+
+ goto skip_cow;
+ } else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
+ && !skb_has_frag_list(skb)) {
+ int allocsize;
+ struct sock *sk = skb->sk;
+ struct page_frag *pfrag = &x->xfrag;
+
+ allocsize = ALIGN(tailen, L1_CACHE_BYTES);
+
+ spin_lock_bh(&x->lock);
+
+ if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+ spin_unlock_bh(&x->lock);
+ goto cow;
+ }
+
+ page = pfrag->page;
+ get_page(page);
+
+ vaddr = kmap_atomic(page);
+
+ tail = vaddr + pfrag->offset;
+
+ esp_output_fill_trailer(tail, tfclen, plen, proto);
+
+ kunmap_atomic(vaddr);
+
+ nfrags = skb_shinfo(skb)->nr_frags;
+
+ __skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
+ tailen);
+ skb_shinfo(skb)->nr_frags = ++nfrags;
+
+ pfrag->offset = pfrag->offset + allocsize;
+ nfrags++;
+
+ skb->len += tailen;
+ skb->data_len += tailen;
+ skb->truesize += tailen;
+ if (sk)
+ atomic_add(tailen, &sk->sk_wmem_alloc);
+
+ skb_push(skb, -skb_network_offset(skb));
+
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+ esph->spi = x->id.spi;
+
+ tmp = esp_alloc_tmp(aead, nfrags + 2, seqhilen);
+ if (!tmp) {
+ spin_unlock_bh(&x->lock);
+ err = -ENOMEM;
+ goto error;
+ }
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
+ req = esp_tmp_req(aead, iv);
+ sg = esp_req_sg(aead, req);
+ dsg = &sg[nfrags];
+
+ esph = esp_output_set_esn(skb, esph, seqhi);
+
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg,
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + clen + alen);
+
+ allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
+
+ if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+ spin_unlock_bh(&x->lock);
+ err = -ENOMEM;
+ goto error;
+ }
+
+ skb_shinfo(skb)->nr_frags = 1;
+
+ page = pfrag->page;
+ get_page(page);
+ /* replace page frags in skb with new page */
+ __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
+ pfrag->offset = pfrag->offset + allocsize;
+
+ sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
+ skb_to_sgvec(skb, dsg,
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + clen + alen);
+
+ spin_unlock_bh(&x->lock);
+
+ goto skip_cow2;
+ }
}
- seqhi = esp_tmp_seqhi(tmp);
- iv = esp_tmp_iv(aead, tmp, seqhilen);
- req = esp_tmp_req(aead, iv);
- sg = esp_req_sg(aead, req);
+cow:
+ err = skb_cow_data(skb, tailen, &trailer);
+ if (err < 0)
+ goto error;
+ nfrags = err;
- /* Fill padding... */
tail = skb_tail_pointer(trailer);
- if (tfclen) {
- memset(tail, 0, tfclen);
- tail += tfclen;
- }
- do {
- int i;
- for (i = 0; i < plen - 2; i++)
- tail[i] = i + 1;
- } while (0);
- tail[plen - 2] = plen - 2;
- tail[plen - 1] = *skb_mac_header(skb);
- pskb_put(skb, trailer, clen - skb->len + alen);
+ esph = ip_esp_hdr(skb);
+skip_cow:
+ esp_output_fill_trailer(tail, tfclen, plen, proto);
+
+ pskb_put(skb, trailer, clen - skb->len + alen);
skb_push(skb, -skb_network_offset(skb));
- esph = ip_esp_hdr(skb);
- *skb_mac_header(skb) = IPPROTO_ESP;
esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+ esph->spi = x->id.spi;
- aead_request_set_callback(req, 0, esp_output_done, skb);
-
- /* For ESN we move the header forward by 4 bytes to
- * accomodate the high bits. We will move it back after
- * encryption.
- */
- if ((x->props.flags & XFRM_STATE_ESN)) {
- esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
- *seqhi = esph->spi;
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
- aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+ tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
+ if (!tmp) {
+ err = -ENOMEM;
+ goto error;
}
- esph->spi = x->id.spi;
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
+ req = esp_tmp_req(aead, iv);
+ sg = esp_req_sg(aead, req);
+ dsg = sg;
+
+ esph = esp_output_set_esn(skb, esph, seqhi);
sg_init_table(sg, nfrags);
skb_to_sgvec(skb, sg,
(unsigned char *)esph - skb->data,
assoclen + ivlen + clen + alen);
- aead_request_set_crypt(req, sg, sg, ivlen + clen, iv);
+skip_cow2:
+ if ((x->props.flags & XFRM_STATE_ESN))
+ aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+ else
+ aead_request_set_callback(req, 0, esp_output_done, skb);
+
+ aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv);
aead_request_set_ad(req, assoclen);
seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
@@ -278,6 +439,8 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
esp_output_restore_header(skb);
}
+ if (sg != dsg)
+ esp_ssg_unref(x, tmp);
kfree(tmp);
error:
@@ -343,6 +506,23 @@ static void esp_input_restore_header(struct sk_buff *skb)
__skb_pull(skb, 4);
}
+static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
+{
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data;
+
+ /* For ESN we move the header forward by 4 bytes to
+ * accomodate the high bits. We will move it back after
+ * decryption.
+ */
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ esph = (void *)skb_push(skb, 4);
+ *seqhi = esph->spi;
+ esph->spi = esph->seq_no;
+ esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+ }
+}
+
static void esp_input_done_esn(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
@@ -378,14 +558,6 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
goto out;
}
- nfrags = skb_cow_data(skb, 0, &trailer);
- if (nfrags < 0) {
- ret = -EINVAL;
- goto out;
- }
-
- ret = -ENOMEM;
-
assoclen = sizeof(*esph);
seqhilen = 0;
@@ -394,6 +566,27 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
assoclen += seqhilen;
}
+ if (!skb_cloned(skb)) {
+ if (!skb_is_nonlinear(skb)) {
+ nfrags = 1;
+
+ goto skip_cow;
+ } else if (!skb_has_frag_list(skb)) {
+ nfrags = skb_shinfo(skb)->nr_frags;
+ nfrags++;
+
+ goto skip_cow;
+ }
+ }
+
+ nfrags = skb_cow_data(skb, 0, &trailer);
+ if (nfrags < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+skip_cow:
+ ret = -ENOMEM;
tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
if (!tmp)
goto out;
@@ -404,26 +597,17 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
req = esp_tmp_req(aead, iv);
sg = esp_req_sg(aead, req);
- skb->ip_summed = CHECKSUM_NONE;
+ esp_input_set_header(skb, seqhi);
- esph = (struct ip_esp_hdr *)skb->data;
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg, 0, skb->len);
- aead_request_set_callback(req, 0, esp_input_done, skb);
+ skb->ip_summed = CHECKSUM_NONE;
- /* For ESN we move the header forward by 4 bytes to
- * accomodate the high bits. We will move it back after
- * decryption.
- */
- if ((x->props.flags & XFRM_STATE_ESN)) {
- esph = (void *)skb_push(skb, 4);
- *seqhi = esph->spi;
- esph->spi = esph->seq_no;
- esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+ if ((x->props.flags & XFRM_STATE_ESN))
aead_request_set_callback(req, 0, esp_input_done_esn, skb);
- }
-
- sg_init_table(sg, nfrags);
- skb_to_sgvec(skb, sg, 0, skb->len);
+ else
+ aead_request_set_callback(req, 0, esp_input_done, skb);
aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
aead_request_set_ad(req, assoclen);
@@ -474,9 +658,10 @@ static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
new file mode 100644
index 000000000000..d914eb93204a
--- /dev/null
+++ b/net/ipv6/esp6_offload.c
@@ -0,0 +1,108 @@
+/*
+ * IPV6 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * Copyright (C) 2016 secunet Security Networks AG
+ * Author: Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * ESP GRO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/ip6_route.h>
+#include <net/ipv6.h>
+#include <linux/icmpv6.h>
+
+static struct sk_buff **esp6_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ int offset = skb_gro_offset(skb);
+ struct xfrm_offload *xo;
+ struct xfrm_state *x;
+ __be32 seq;
+ __be32 spi;
+ int err;
+
+ skb_pull(skb, offset);
+
+ if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0)
+ goto out;
+
+ err = secpath_set(skb);
+ if (err)
+ goto out;
+
+ if (skb->sp->len == XFRM_MAX_DEPTH)
+ goto out;
+
+ x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
+ (xfrm_address_t *)&ipv6_hdr(skb)->daddr,
+ spi, IPPROTO_ESP, AF_INET6);
+ if (!x)
+ goto out;
+
+ skb->sp->xvec[skb->sp->len++] = x;
+ skb->sp->olen++;
+
+ xo = xfrm_offload(skb);
+ if (!xo) {
+ xfrm_state_put(x);
+ goto out;
+ }
+ xo->flags |= XFRM_GRO;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
+ XFRM_SPI_SKB_CB(skb)->seq = seq;
+
+ /* We don't need to handle errors from xfrm_input, it does all
+ * the error handling and frees the resources on error. */
+ xfrm_input(skb, IPPROTO_ESP, spi, -2);
+
+ return ERR_PTR(-EINPROGRESS);
+out:
+ skb_push(skb, offset);
+ NAPI_GRO_CB(skb)->same_flow = 0;
+ NAPI_GRO_CB(skb)->flush = 1;
+
+ return NULL;
+}
+
+static const struct net_offload esp6_offload = {
+ .callbacks = {
+ .gro_receive = esp6_gro_receive,
+ },
+};
+
+static int __init esp6_offload_init(void)
+{
+ return inet6_add_offload(&esp6_offload, IPPROTO_ESP);
+}
+
+static void __exit esp6_offload_exit(void)
+{
+ inet6_del_offload(&esp6_offload, IPPROTO_ESP);
+}
+
+module_init(esp6_offload_init);
+module_exit(esp6_offload_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 139ceb68bd37..25192a3b0cd7 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -47,6 +47,11 @@
#if IS_ENABLED(CONFIG_IPV6_MIP6)
#include <net/xfrm.h>
#endif
+#include <linux/seg6.h>
+#include <net/seg6.h>
+#ifdef CONFIG_IPV6_SEG6_HMAC
+#include <net/seg6_hmac.h>
+#endif
#include <linux/uaccess.h>
@@ -227,7 +232,7 @@ static bool ipv6_dest_hao(struct sk_buff *skb, int optoff)
ipv6h->saddr = hao->addr;
hao->addr = tmp_addr;
- if (skb->tstamp.tv64 == 0)
+ if (skb->tstamp == 0)
__net_timestamp(skb);
return true;
@@ -286,6 +291,156 @@ static int ipv6_destopt_rcv(struct sk_buff *skb)
return -1;
}
+static void seg6_update_csum(struct sk_buff *skb)
+{
+ struct ipv6_sr_hdr *hdr;
+ struct in6_addr *addr;
+ __be32 from, to;
+
+ /* srh is at transport offset and seg_left is already decremented
+ * but daddr is not yet updated with next segment
+ */
+
+ hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+ addr = hdr->segments + hdr->segments_left;
+
+ hdr->segments_left++;
+ from = *(__be32 *)hdr;
+
+ hdr->segments_left--;
+ to = *(__be32 *)hdr;
+
+ /* update skb csum with diff resulting from seg_left decrement */
+
+ update_csum_diff4(skb, from, to);
+
+ /* compute csum diff between current and next segment and update */
+
+ update_csum_diff16(skb, (__be32 *)(&ipv6_hdr(skb)->daddr),
+ (__be32 *)addr);
+}
+
+static int ipv6_srh_rcv(struct sk_buff *skb)
+{
+ struct inet6_skb_parm *opt = IP6CB(skb);
+ struct net *net = dev_net(skb->dev);
+ struct ipv6_sr_hdr *hdr;
+ struct inet6_dev *idev;
+ struct in6_addr *addr;
+ int accept_seg6;
+
+ hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+
+ idev = __in6_dev_get(skb->dev);
+
+ accept_seg6 = net->ipv6.devconf_all->seg6_enabled;
+ if (accept_seg6 > idev->cnf.seg6_enabled)
+ accept_seg6 = idev->cnf.seg6_enabled;
+
+ if (!accept_seg6) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (!seg6_hmac_validate_skb(skb)) {
+ kfree_skb(skb);
+ return -1;
+ }
+#endif
+
+looped_back:
+ if (hdr->segments_left == 0) {
+ if (hdr->nexthdr == NEXTHDR_IPV6) {
+ int offset = (hdr->hdrlen + 1) << 3;
+
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ skb_network_header_len(skb));
+
+ if (!pskb_pull(skb, offset)) {
+ kfree_skb(skb);
+ return -1;
+ }
+ skb_postpull_rcsum(skb, skb_transport_header(skb),
+ offset);
+
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb->encapsulation = 0;
+
+ __skb_tunnel_rx(skb, skb->dev, net);
+
+ netif_rx(skb);
+ return -1;
+ }
+
+ opt->srcrt = skb_network_header_len(skb);
+ opt->lastopt = opt->srcrt;
+ skb->transport_header += (hdr->hdrlen + 1) << 3;
+ opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb);
+
+ return 1;
+ }
+
+ if (hdr->segments_left >= (hdr->hdrlen >> 1)) {
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+ ((&hdr->segments_left) -
+ skb_network_header(skb)));
+ return -1;
+ }
+
+ if (skb_cloned(skb)) {
+ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTDISCARDS);
+ kfree_skb(skb);
+ return -1;
+ }
+ }
+
+ hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+
+ hdr->segments_left--;
+ addr = hdr->segments + hdr->segments_left;
+
+ skb_push(skb, sizeof(struct ipv6hdr));
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ seg6_update_csum(skb);
+
+ ipv6_hdr(skb)->daddr = *addr;
+
+ skb_dst_drop(skb);
+
+ ip6_route_input(skb);
+
+ if (skb_dst(skb)->error) {
+ dst_input(skb);
+ return -1;
+ }
+
+ if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
+ if (ipv6_hdr(skb)->hop_limit <= 1) {
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
+ icmpv6_send(skb, ICMPV6_TIME_EXCEED,
+ ICMPV6_EXC_HOPLIMIT, 0);
+ kfree_skb(skb);
+ return -1;
+ }
+ ipv6_hdr(skb)->hop_limit--;
+
+ skb_pull(skb, sizeof(struct ipv6hdr));
+ goto looped_back;
+ }
+
+ dst_input(skb);
+
+ return -1;
+}
+
/********************************
Routing header.
********************************/
@@ -326,6 +481,10 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
return -1;
}
+ /* segment routing */
+ if (hdr->type == IPV6_SRCRT_TYPE_4)
+ return ipv6_srh_rcv(skb);
+
looped_back:
if (hdr->segments_left == 0) {
switch (hdr->type) {
@@ -679,9 +838,9 @@ int ipv6_parse_hopopts(struct sk_buff *skb)
* for headers.
*/
-static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
- struct ipv6_rt_hdr *opt,
- struct in6_addr **addr_p)
+static void ipv6_push_rthdr0(struct sk_buff *skb, u8 *proto,
+ struct ipv6_rt_hdr *opt,
+ struct in6_addr **addr_p, struct in6_addr *saddr)
{
struct rt0_hdr *phdr, *ihdr;
int hops;
@@ -704,6 +863,62 @@ static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
*proto = NEXTHDR_ROUTING;
}
+static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto,
+ struct ipv6_rt_hdr *opt,
+ struct in6_addr **addr_p, struct in6_addr *saddr)
+{
+ struct ipv6_sr_hdr *sr_phdr, *sr_ihdr;
+ int plen, hops;
+
+ sr_ihdr = (struct ipv6_sr_hdr *)opt;
+ plen = (sr_ihdr->hdrlen + 1) << 3;
+
+ sr_phdr = (struct ipv6_sr_hdr *)skb_push(skb, plen);
+ memcpy(sr_phdr, sr_ihdr, sizeof(struct ipv6_sr_hdr));
+
+ hops = sr_ihdr->first_segment + 1;
+ memcpy(sr_phdr->segments + 1, sr_ihdr->segments + 1,
+ (hops - 1) * sizeof(struct in6_addr));
+
+ sr_phdr->segments[0] = **addr_p;
+ *addr_p = &sr_ihdr->segments[hops - 1];
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (sr_has_hmac(sr_phdr)) {
+ struct net *net = NULL;
+
+ if (skb->dev)
+ net = dev_net(skb->dev);
+ else if (skb->sk)
+ net = sock_net(skb->sk);
+
+ WARN_ON(!net);
+
+ if (net)
+ seg6_push_hmac(net, saddr, sr_phdr);
+ }
+#endif
+
+ sr_phdr->nexthdr = *proto;
+ *proto = NEXTHDR_ROUTING;
+}
+
+static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
+ struct ipv6_rt_hdr *opt,
+ struct in6_addr **addr_p, struct in6_addr *saddr)
+{
+ switch (opt->type) {
+ case IPV6_SRCRT_TYPE_0:
+ ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr);
+ break;
+ case IPV6_SRCRT_TYPE_4:
+ ipv6_push_rthdr4(skb, proto, opt, addr_p, saddr);
+ break;
+ default:
+ break;
+ }
+}
+
static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt)
{
struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_push(skb, ipv6_optlen(opt));
@@ -715,10 +930,10 @@ static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv
void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
u8 *proto,
- struct in6_addr **daddr)
+ struct in6_addr **daddr, struct in6_addr *saddr)
{
if (opt->srcrt) {
- ipv6_push_rthdr(skb, proto, opt->srcrt, daddr);
+ ipv6_push_rthdr(skb, proto, opt->srcrt, daddr, saddr);
/*
* IPV6_RTHDRDSTOPTS is ignored
* unless IPV6_RTHDR is set (RFC3542).
@@ -945,7 +1160,22 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
return NULL;
*orig = fl6->daddr;
- fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr;
+
+ switch (opt->srcrt->type) {
+ case IPV6_SRCRT_TYPE_0:
+ fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr;
+ break;
+ case IPV6_SRCRT_TYPE_4:
+ {
+ struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt;
+
+ fl6->daddr = srh->segments[srh->first_segment];
+ break;
+ }
+ default:
+ return NULL;
+ }
+
return orig;
}
EXPORT_SYMBOL_GPL(fl6_update_dst);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 2772004ba5a1..230b5aac9f03 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -70,7 +70,7 @@
#include <net/dsfield.h>
#include <net/l3mdev.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* The ICMP socket(s). This is the most convenient way to flow control
@@ -92,9 +92,10 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
struct net *net = dev_net(skb->dev);
if (type == ICMPV6_PKT_TOOBIG)
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
else if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
if (!(type & ICMPV6_INFOMSG_MASK))
if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
@@ -109,19 +110,17 @@ static const struct inet6_protocol icmpv6_protocol = {
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};
+/* Called with BH disabled */
static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
{
struct sock *sk;
- local_bh_disable();
-
sk = icmpv6_sk(net);
if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
/* This can happen if the output path (f.e. SIT or
* ip6ip6 tunnel) signals dst_link_failure() for an
* outgoing ICMP6 packet.
*/
- local_bh_enable();
return NULL;
}
return sk;
@@ -129,7 +128,7 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
static __inline__ void icmpv6_xmit_unlock(struct sock *sk)
{
- spin_unlock_bh(&sk->sk_lock.slock);
+ spin_unlock(&sk->sk_lock.slock);
}
/*
@@ -167,6 +166,30 @@ static bool is_ineligible(const struct sk_buff *skb)
return false;
}
+static bool icmpv6_mask_allow(int type)
+{
+ /* Informational messages are not limited. */
+ if (type & ICMPV6_INFOMSG_MASK)
+ return true;
+
+ /* Do not limit pmtu discovery, it would break it. */
+ if (type == ICMPV6_PKT_TOOBIG)
+ return true;
+
+ return false;
+}
+
+static bool icmpv6_global_allow(int type)
+{
+ if (icmpv6_mask_allow(type))
+ return true;
+
+ if (icmp_global_allow())
+ return true;
+
+ return false;
+}
+
/*
* Check the ICMP output rate limit
*/
@@ -177,12 +200,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
struct dst_entry *dst;
bool res = false;
- /* Informational messages are not limited. */
- if (type & ICMPV6_INFOMSG_MASK)
- return true;
-
- /* Do not limit pmtu discovery, it would break it. */
- if (type == ICMPV6_PKT_TOOBIG)
+ if (icmpv6_mask_allow(type))
return true;
/*
@@ -199,20 +217,16 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
} else {
struct rt6_info *rt = (struct rt6_info *)dst;
int tmo = net->ipv6.sysctl.icmpv6_time;
+ struct inet_peer *peer;
/* Give more bandwidth to wider prefixes. */
if (rt->rt6i_dst.plen < 128)
tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
- if (icmp_global_allow()) {
- struct inet_peer *peer;
-
- peer = inet_getpeer_v6(net->ipv6.peers,
- &fl6->daddr, 1);
- res = inet_peer_xrlim_allow(peer, tmo);
- if (peer)
- inet_putpeer(peer);
- }
+ peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1);
+ res = inet_peer_xrlim_allow(peer, tmo);
+ if (peer)
+ inet_putpeer(peer);
}
dst_release(dst);
return res;
@@ -473,6 +487,13 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
return;
}
+ /* Needed by both icmp_global_allow and icmpv6_xmit_lock */
+ local_bh_disable();
+
+ /* Check global sysctl_icmp_msgs_per_sec ratelimit */
+ if (!icmpv6_global_allow(type))
+ goto out_bh_enable;
+
mip6_addr_swap(skb);
memset(&fl6, 0, sizeof(fl6));
@@ -486,11 +507,13 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
fl6.flowi6_oif = iif;
fl6.fl6_icmp_type = type;
fl6.fl6_icmp_code = code;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
sk = icmpv6_xmit_lock(net);
if (!sk)
- return;
+ goto out_bh_enable;
+
sk->sk_mark = mark;
np = inet6_sk(sk);
@@ -550,6 +573,8 @@ out_dst_release:
dst_release(dst);
out:
icmpv6_xmit_unlock(sk);
+out_bh_enable:
+ local_bh_enable();
}
/* Slightly more convenient version of icmp6_send.
@@ -660,11 +685,13 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
fl6.flowi6_oif = skb->dev->ifindex;
fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
fl6.flowi6_mark = mark;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+ local_bh_disable();
sk = icmpv6_xmit_lock(net);
if (!sk)
- return;
+ goto out_bh_enable;
sk->sk_mark = mark;
np = inet6_sk(sk);
@@ -706,6 +733,8 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
dst_release(dst);
out:
icmpv6_xmit_unlock(sk);
+out_bh_enable:
+ local_bh_enable();
}
void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index e50c27a93e17..ce1aae4a7fc8 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -6,29 +6,88 @@
#include <linux/socket.h>
#include <linux/types.h>
#include <net/checksum.h>
+#include <net/dst_cache.h>
#include <net/ip.h>
#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
#include <net/lwtunnel.h>
#include <net/protocol.h>
#include <uapi/linux/ila.h>
#include "ila.h"
+struct ila_lwt {
+ struct ila_params p;
+ struct dst_cache dst_cache;
+ u32 connected : 1;
+};
+
+static inline struct ila_lwt *ila_lwt_lwtunnel(
+ struct lwtunnel_state *lwt)
+{
+ return (struct ila_lwt *)lwt->data;
+}
+
static inline struct ila_params *ila_params_lwtunnel(
- struct lwtunnel_state *lwstate)
+ struct lwtunnel_state *lwt)
{
- return (struct ila_params *)lwstate->data;
+ return &ila_lwt_lwtunnel(lwt)->p;
}
static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct rt6_info *rt = (struct rt6_info *)orig_dst;
+ struct ila_lwt *ilwt = ila_lwt_lwtunnel(orig_dst->lwtstate);
+ struct dst_entry *dst;
+ int err = -EINVAL;
if (skb->protocol != htons(ETH_P_IPV6))
goto drop;
- ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), true);
+ ila_update_ipv6_locator(skb, ila_params_lwtunnel(orig_dst->lwtstate),
+ true);
- return dst->lwtstate->orig_output(net, sk, skb);
+ if (rt->rt6i_flags & (RTF_GATEWAY | RTF_CACHE)) {
+ /* Already have a next hop address in route, no need for
+ * dest cache route.
+ */
+ return orig_dst->lwtstate->orig_output(net, sk, skb);
+ }
+
+ dst = dst_cache_get(&ilwt->dst_cache);
+ if (unlikely(!dst)) {
+ struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ struct flowi6 fl6;
+
+ /* Lookup a route for the new destination. Take into
+ * account that the base route may already have a gateway.
+ */
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_oif = orig_dst->dev->ifindex;
+ fl6.flowi6_iif = LOOPBACK_IFINDEX;
+ fl6.daddr = *rt6_nexthop((struct rt6_info *)orig_dst,
+ &ip6h->daddr);
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ err = -EHOSTUNREACH;
+ dst_release(dst);
+ goto drop;
+ }
+
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ goto drop;
+ }
+
+ if (ilwt->connected)
+ dst_cache_set_ip6(&ilwt->dst_cache, dst, &fl6.saddr);
+ }
+
+ skb_dst_set(skb, dst);
+ return dst_output(net, sk, skb);
drop:
kfree_skb(skb);
@@ -56,13 +115,13 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
[ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
};
-static int ila_build_state(struct net_device *dev, struct nlattr *nla,
+static int ila_build_state(struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts)
{
+ struct ila_lwt *ilwt;
struct ila_params *p;
struct nlattr *tb[ILA_ATTR_MAX + 1];
- size_t encap_len = sizeof(*p);
struct lwtunnel_state *newts;
const struct fib6_config *cfg6 = cfg;
struct ila_addr *iaddr;
@@ -71,7 +130,7 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla,
if (family != AF_INET6)
return -EINVAL;
- if (cfg6->fc_dst_len < sizeof(struct ila_locator) + 1) {
+ if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) {
/* Need to have full locator and at least type field
* included in destination
*/
@@ -95,11 +154,17 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla,
if (!tb[ILA_ATTR_LOCATOR])
return -EINVAL;
- newts = lwtunnel_state_alloc(encap_len);
+ newts = lwtunnel_state_alloc(sizeof(*ilwt));
if (!newts)
return -ENOMEM;
- newts->len = encap_len;
+ ilwt = ila_lwt_lwtunnel(newts);
+ ret = dst_cache_init(&ilwt->dst_cache, GFP_ATOMIC);
+ if (ret) {
+ kfree(newts);
+ return ret;
+ }
+
p = ila_params_lwtunnel(newts);
p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]);
@@ -120,11 +185,19 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla,
newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT |
LWTUNNEL_STATE_INPUT_REDIRECT;
+ if (cfg6->fc_dst_len == 8 * sizeof(struct in6_addr))
+ ilwt->connected = 1;
+
*ts = newts;
return 0;
}
+static void ila_destroy_state(struct lwtunnel_state *lwt)
+{
+ dst_cache_destroy(&ila_lwt_lwtunnel(lwt)->dst_cache);
+}
+
static int ila_fill_encap_info(struct sk_buff *skb,
struct lwtunnel_state *lwtstate)
{
@@ -159,11 +232,13 @@ static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
static const struct lwtunnel_encap_ops ila_encap_ops = {
.build_state = ila_build_state,
+ .destroy_state = ila_destroy_state,
.output = ila_output,
.input = ila_input,
.fill_encap = ila_fill_encap_info,
.get_encap_size = ila_encap_nlsize,
.cmp_encap = ila_encap_cmp,
+ .owner = THIS_MODULE,
};
int ila_lwt_init(void)
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index e604013dd814..af8f52ee7180 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -118,15 +118,7 @@ static const struct rhashtable_params rht_params = {
.obj_cmpfn = ila_cmpfn,
};
-static struct genl_family ila_nl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = ILA_GENL_NAME,
- .version = ILA_GENL_VERSION,
- .maxattr = ILA_ATTR_MAX,
- .netnsok = true,
- .parallel_ops = true,
-};
+static struct genl_family ila_nl_family;
static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
[ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
@@ -482,7 +474,15 @@ static int ila_nl_dump_start(struct netlink_callback *cb)
{
struct net *net = sock_net(cb->skb->sk);
struct ila_net *ilan = net_generic(net, ila_net_id);
- struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args;
+ struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
+
+ if (!iter) {
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ cb->args[0] = (long)iter;
+ }
return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter,
GFP_KERNEL);
@@ -490,16 +490,18 @@ static int ila_nl_dump_start(struct netlink_callback *cb)
static int ila_nl_dump_done(struct netlink_callback *cb)
{
- struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args;
+ struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
rhashtable_walk_exit(&iter->rhiter);
+ kfree(iter);
+
return 0;
}
static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args;
+ struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
struct rhashtable_iter *rhiter = &iter->rhiter;
struct ila_map *ila;
int ret;
@@ -561,6 +563,18 @@ static const struct genl_ops ila_nl_ops[] = {
},
};
+static struct genl_family ila_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = ILA_GENL_NAME,
+ .version = ILA_GENL_VERSION,
+ .maxattr = ILA_ATTR_MAX,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .ops = ila_nl_ops,
+ .n_ops = ARRAY_SIZE(ila_nl_ops),
+};
+
#define ILA_HASH_TABLE_SIZE 1024
static __net_init int ila_init_net(struct net *net)
@@ -623,7 +637,7 @@ static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral)
return 0;
}
-int ila_xlat_init(void)
+int __init ila_xlat_init(void)
{
int ret;
@@ -631,8 +645,7 @@ int ila_xlat_init(void)
if (ret)
goto exit;
- ret = genl_register_family_with_ops(&ila_nl_family,
- ila_nl_ops);
+ ret = genl_register_family(&ila_nl_family);
if (ret < 0)
goto unregister;
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 532c3ef282c5..9a31d13bf180 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -28,45 +28,6 @@
#include <net/inet6_connection_sock.h>
#include <net/sock_reuseport.h>
-int inet6_csk_bind_conflict(const struct sock *sk,
- const struct inet_bind_bucket *tb, bool relax)
-{
- const struct sock *sk2;
- int reuse = sk->sk_reuse;
- int reuseport = sk->sk_reuseport;
- kuid_t uid = sock_i_uid((struct sock *)sk);
-
- /* We must walk the whole port owner list in this case. -DaveM */
- /*
- * See comment in inet_csk_bind_conflict about sock lookup
- * vs net namespaces issues.
- */
- sk_for_each_bound(sk2, &tb->owners) {
- if (sk != sk2 &&
- (!sk->sk_bound_dev_if ||
- !sk2->sk_bound_dev_if ||
- sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
- if ((!reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) &&
- (!reuseport || !sk2->sk_reuseport ||
- rcu_access_pointer(sk->sk_reuseport_cb) ||
- (sk2->sk_state != TCP_TIME_WAIT &&
- !uid_eq(uid,
- sock_i_uid((struct sock *)sk2))))) {
- if (ipv6_rcv_saddr_equal(sk, sk2, true))
- break;
- }
- if (!relax && reuse && sk2->sk_reuse &&
- sk2->sk_state != TCP_LISTEN &&
- ipv6_rcv_saddr_equal(sk, sk2, true))
- break;
- }
- }
-
- return sk2 != NULL;
-}
-EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict);
-
struct dst_entry *inet6_csk_route_req(const struct sock *sk,
struct flowi6 *fl6,
const struct request_sock *req,
@@ -88,6 +49,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk,
fl6->flowi6_mark = ireq->ir_mark;
fl6->fl6_dport = ireq->ir_rmt_port;
fl6->fl6_sport = htons(ireq->ir_num);
+ fl6->flowi6_uid = sk->sk_uid;
security_req_classify_flow(req, flowi6_to_flowi(fl6));
dst = ip6_dst_lookup_flow(sk, fl6, final_p);
@@ -136,6 +98,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
fl6->flowi6_mark = sk->sk_mark;
fl6->fl6_sport = inet->inet_sport;
fl6->fl6_dport = inet->inet_dport;
+ fl6->flowi6_uid = sk->sk_uid;
security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
rcu_read_lock();
@@ -173,7 +136,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused
/* Restore final destination back after routing done */
fl6.daddr = sk->sk_v6_daddr;
- res = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt),
+ res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
np->tclass);
rcu_read_unlock();
return res;
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 02761c9fe43e..d0900918a19e 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -268,54 +268,10 @@ int inet6_hash(struct sock *sk)
if (sk->sk_state != TCP_CLOSE) {
local_bh_disable();
- err = __inet_hash(sk, NULL, ipv6_rcv_saddr_equal);
+ err = __inet_hash(sk, NULL);
local_bh_enable();
}
return err;
}
EXPORT_SYMBOL_GPL(inet6_hash);
-
-/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
- * only, and any IPv4 addresses if not IPv6 only
- * match_wildcard == false: addresses must be exactly the same, i.e.
- * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
- * and 0.0.0.0 equals to 0.0.0.0 only
- */
-int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
- bool match_wildcard)
-{
- const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
- int sk2_ipv6only = inet_v6_ipv6only(sk2);
- int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
- int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
-
- /* if both are mapped, treat as IPv4 */
- if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
- if (!sk2_ipv6only) {
- if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
- return 1;
- if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
- return match_wildcard;
- }
- return 0;
- }
-
- if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
- return 1;
-
- if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
- !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
- return 1;
-
- if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
- !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
- return 1;
-
- if (sk2_rcv_saddr6 &&
- ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6))
- return 1;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(ipv6_rcv_saddr_equal);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index ef5485204522..d4bf2c68a545 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -318,6 +318,16 @@ static int fib6_dump_node(struct fib6_walker *w)
w->leaf = rt;
return 1;
}
+
+ /* Multipath routes are dumped in one route with the
+ * RTA_MULTIPATH attribute. Jump 'rt' to point to the
+ * last sibling of this route (no need to dump the
+ * sibling routes again)
+ */
+ if (rt->rt6i_nsiblings)
+ rt = list_last_entry(&rt->rt6i_siblings,
+ struct rt6_info,
+ rt6i_siblings);
}
w->leaf = NULL;
return 0;
@@ -746,6 +756,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
u16 nlflags = NLM_F_EXCL;
int err;
+ if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
+ nlflags |= NLM_F_APPEND;
+
ins = &fn->leaf;
for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) {
@@ -868,7 +881,8 @@ add:
*ins = rt;
rt->rt6i_node = fn;
atomic_inc(&rt->rt6i_ref);
- inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
+ if (!info->skip_notify)
+ inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
if (!(fn->fn_flags & RTN_RTINFO)) {
@@ -894,7 +908,8 @@ add:
rt->rt6i_node = fn;
rt->dst.rt6_next = iter->dst.rt6_next;
atomic_inc(&rt->rt6i_ref);
- inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
+ if (!info->skip_notify)
+ inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
if (!(fn->fn_flags & RTN_RTINFO)) {
info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
fn->fn_flags |= RTN_RTINFO;
@@ -908,6 +923,8 @@ add:
ins = &rt->dst.rt6_next;
iter = *ins;
while (iter) {
+ if (iter->rt6i_metric > rt->rt6i_metric)
+ break;
if (rt6_qualify_for_ecmp(iter)) {
*ins = iter->dst.rt6_next;
fib6_purge_rt(iter, fn, info->nl_net);
@@ -1439,7 +1456,8 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
fib6_purge_rt(rt, fn, net);
- inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
+ if (!info->skip_notify)
+ inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
rt6_release(rt);
}
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index b912f0dbaf72..8081bafe441b 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -29,7 +29,7 @@
#include <net/rawv6.h>
#include <net/transp_v6.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified
in old IPv6 RFC. Well, it was reasonable value.
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index d7d6d3ae0b3b..6fcb7cb49bb2 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -64,7 +64,7 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
#define IP6_GRE_HASH_SIZE_SHIFT 5
#define IP6_GRE_HASH_SIZE (1 << IP6_GRE_HASH_SIZE_SHIFT)
-static int ip6gre_net_id __read_mostly;
+static unsigned int ip6gre_net_id __read_mostly;
struct ip6gre_net {
struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE];
@@ -367,35 +367,37 @@ static void ip6gre_tunnel_uninit(struct net_device *dev)
static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
- u8 type, u8 code, int offset, __be32 info)
+ u8 type, u8 code, int offset, __be32 info)
{
- const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data;
- __be16 *p = (__be16 *)(skb->data + offset);
- int grehlen = offset + 4;
+ const struct gre_base_hdr *greh;
+ const struct ipv6hdr *ipv6h;
+ int grehlen = sizeof(*greh);
struct ip6_tnl *t;
+ int key_off = 0;
__be16 flags;
+ __be32 key;
- flags = p[0];
- if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
- if (flags&(GRE_VERSION|GRE_ROUTING))
- return;
- if (flags&GRE_KEY) {
- grehlen += 4;
- if (flags&GRE_CSUM)
- grehlen += 4;
- }
+ if (!pskb_may_pull(skb, offset + grehlen))
+ return;
+ greh = (const struct gre_base_hdr *)(skb->data + offset);
+ flags = greh->flags;
+ if (flags & (GRE_VERSION | GRE_ROUTING))
+ return;
+ if (flags & GRE_CSUM)
+ grehlen += 4;
+ if (flags & GRE_KEY) {
+ key_off = grehlen + offset;
+ grehlen += 4;
}
- /* If only 8 bytes returned, keyed message will be dropped here */
- if (!pskb_may_pull(skb, grehlen))
+ if (!pskb_may_pull(skb, offset + grehlen))
return;
ipv6h = (const struct ipv6hdr *)skb->data;
- p = (__be16 *)(skb->data + offset);
+ greh = (const struct gre_base_hdr *)(skb->data + offset);
+ key = key_off ? *(__be32 *)(skb->data + key_off) : 0;
t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
- flags & GRE_KEY ?
- *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
- p[1]);
+ key, greh->protocol);
if (!t)
return;
@@ -484,11 +486,6 @@ drop:
return 0;
}
-struct ipv6_tel_txoption {
- struct ipv6_txoptions ops;
- __u8 dst_opt[8];
-};
-
static int gre_handle_offloads(struct sk_buff *skb, bool csum)
{
return iptunnel_handle_offloads(skb,
@@ -548,6 +545,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
if (err)
return -1;
@@ -580,6 +579,9 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
return -1;
offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
+ /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
+ ipv6h = ipv6_hdr(skb);
+
if (offset > 0) {
struct ipv6_tlv_tnl_enc_lim *tel;
tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
@@ -602,6 +604,8 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)))
return -1;
@@ -994,6 +998,9 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
dev->flags |= IFF_NOARP;
dev->addr_len = sizeof(struct in6_addr);
netif_keep_dst(dev);
+ /* This perm addr will be used as interface identifier by IPv6 */
+ dev->addr_assign_type = NET_ADDR_RANDOM;
+ eth_random_addr(dev->perm_addr);
}
static int ip6gre_tunnel_init_common(struct net_device *dev)
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index aacfb4bce153..c45b12b4431c 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -122,11 +122,14 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
/*
* RFC4291 2.5.3
+ * The loopback address must not be used as the source address in IPv6
+ * packets that are sent outside of a single node. [..]
* A packet received on an interface with a destination address
* of loopback must be dropped.
*/
- if (!(dev->flags & IFF_LOOPBACK) &&
- ipv6_addr_loopback(&hdr->daddr))
+ if ((ipv6_addr_loopback(&hdr->saddr) ||
+ ipv6_addr_loopback(&hdr->daddr)) &&
+ !(dev->flags & IFF_LOOPBACK))
goto err;
/* RFC4291 Errata ID: 3480
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 89c59e656f44..93e58a5e1837 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -191,6 +191,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
ops = rcu_dereference(inet6_offloads[proto]);
if (!ops || !ops->callbacks.gro_receive) {
__pskb_pull(skb, skb_gro_offset(skb));
+ skb_gro_frag0_invalidate(skb);
proto = ipv6_gso_pull_exthdrs(skb, proto);
skb_gro_pull(skb, -skb_transport_offset(skb));
skb_reset_transport_header(skb);
@@ -252,7 +253,7 @@ out_unlock:
rcu_read_unlock();
out:
- NAPI_GRO_CB(skb)->flush |= flush;
+ skb_gro_flush_final(skb, pp, flush);
return pp;
}
@@ -293,8 +294,10 @@ static int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff);
int err = -ENOSYS;
- if (skb->encapsulation)
+ if (skb->encapsulation) {
+ skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6));
skb_set_inner_network_header(skb, nhoff);
+ }
iph->payload_len = htons(skb->len - nhoff - sizeof(*iph));
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 59eb4ed99ce8..58f6288e9ba5 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -39,6 +39,7 @@
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
@@ -118,7 +119,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
if (unlikely(!neigh))
neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
if (!IS_ERR(neigh)) {
- ret = dst_neigh_output(dst, neigh, skb);
+ sock_confirm_neigh(skb, neigh);
+ ret = neigh_output(neigh, skb);
rcu_read_unlock_bh();
return ret;
}
@@ -131,6 +133,14 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ if (ret) {
+ kfree_skb(skb);
+ return ret;
+ }
+
if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
dst_allfrag(skb_dst(skb)) ||
(IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
@@ -163,7 +173,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
* which are using proper atomic operations or spinlocks.
*/
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
- struct ipv6_txoptions *opt, int tclass)
+ __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
struct net *net = sock_net(sk);
const struct ipv6_pinfo *np = inet6_sk(sk);
@@ -203,7 +213,8 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
if (opt->opt_flen)
ipv6_push_frag_opts(skb, opt, &proto);
if (opt->opt_nflen)
- ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
+ ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
+ &fl6->saddr);
}
skb_push(skb, sizeof(struct ipv6hdr));
@@ -230,7 +241,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
skb->protocol = htons(ETH_P_IPV6);
skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
+ skb->mark = mark;
mtu = dst_mtu(dst);
if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
@@ -624,7 +635,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
hroom = LL_RESERVED_SPACE(rt->dst.dev);
if (skb_has_frag_list(skb)) {
- int first_len = skb_pagelen(skb);
+ unsigned int first_len = skb_pagelen(skb);
struct sk_buff *frag2;
if (first_len - hlen > mtu ||
@@ -757,13 +768,14 @@ slow_path:
* Fragment the datagram.
*/
- *prevhdr = NEXTHDR_FRAGMENT;
troom = rt->dst.dev->needed_tailroom;
/*
* Keep copying data until we run out.
*/
while (left > 0) {
+ u8 *fragnexthdr_offset;
+
len = left;
/* IF: it doesn't fit, use 'mtu' - the data space left */
if (len > mtu)
@@ -808,6 +820,10 @@ slow_path:
*/
skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
+ fragnexthdr_offset = skb_network_header(frag);
+ fragnexthdr_offset += prevhdr - skb_network_header(skb);
+ *fragnexthdr_offset = NEXTHDR_FRAGMENT;
+
/*
* Build fragment header.
*/
@@ -1011,6 +1027,11 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
}
}
#endif
+ if (ipv6_addr_v4mapped(&fl6->saddr) &&
+ !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
+ err = -EAFNOSUPPORT;
+ goto out_err_release;
+ }
return 0;
@@ -1134,6 +1155,9 @@ static inline int ip6_ufo_append_data(struct sock *sk,
skb->protocol = htons(ETH_P_IPV6);
skb->csum = 0;
+ if (flags & MSG_CONFIRM)
+ skb_set_dst_pending_confirm(skb, 1);
+
__skb_queue_tail(queue, skb);
} else if (skb_is_gso(skb)) {
goto append;
@@ -1334,7 +1358,7 @@ emsgsize:
*/
if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
headersize == sizeof(struct ipv6hdr) &&
- length < mtu - headersize &&
+ length <= mtu - headersize &&
!(flags & MSG_MORE) &&
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL;
@@ -1363,10 +1387,10 @@ emsgsize:
*/
cork->length += length;
- if (((length > mtu) ||
+ if ((((length + fragheaderlen) > mtu) ||
(skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
- (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
+ (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
(sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
hh_len, fragheaderlen, exthdrlen,
@@ -1506,6 +1530,9 @@ alloc_new_skb:
exthdrlen = 0;
dst_exthdrlen = 0;
+ if ((flags & MSG_CONFIRM) && !skb_prev)
+ skb_set_dst_pending_confirm(skb, 1);
+
/*
* Put the packet on the pending queue
*/
@@ -1672,7 +1699,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
if (opt && opt->opt_flen)
ipv6_push_frag_opts(skb, opt, &proto);
if (opt && opt->opt_nflen)
- ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
+ ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index d76674efe523..75fac933c209 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -42,7 +42,7 @@
#include <linux/hash.h>
#include <linux/etherdevice.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <net/icmp.h>
@@ -83,7 +83,7 @@ static int ip6_tnl_dev_init(struct net_device *dev);
static void ip6_tnl_dev_setup(struct net_device *dev);
static struct rtnl_link_ops ip6_link_ops __read_mostly;
-static int ip6_tnl_net_id __read_mostly;
+static unsigned int ip6_tnl_net_id __read_mostly;
struct ip6_tnl_net {
/* the IPv6 tunnel fallback device */
struct net_device *fb_tnl_dev;
@@ -400,18 +400,19 @@ ip6_tnl_dev_uninit(struct net_device *dev)
__u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
{
- const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) raw;
- __u8 nexthdr = ipv6h->nexthdr;
- __u16 off = sizeof(*ipv6h);
+ const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw;
+ unsigned int nhoff = raw - skb->data;
+ unsigned int off = nhoff + sizeof(*ipv6h);
+ u8 next, nexthdr = ipv6h->nexthdr;
while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) {
- __u16 optlen = 0;
struct ipv6_opt_hdr *hdr;
- if (raw + off + sizeof(*hdr) > skb->data &&
- !pskb_may_pull(skb, raw - skb->data + off + sizeof (*hdr)))
+ u16 optlen;
+
+ if (!pskb_may_pull(skb, off + sizeof(*hdr)))
break;
- hdr = (struct ipv6_opt_hdr *) (raw + off);
+ hdr = (struct ipv6_opt_hdr *)(skb->data + off);
if (nexthdr == NEXTHDR_FRAGMENT) {
struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr;
if (frag_hdr->frag_off)
@@ -422,20 +423,29 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
} else {
optlen = ipv6_optlen(hdr);
}
+ /* cache hdr->nexthdr, since pskb_may_pull() might
+ * invalidate hdr
+ */
+ next = hdr->nexthdr;
if (nexthdr == NEXTHDR_DEST) {
- __u16 i = off + 2;
+ u16 i = 2;
+
+ /* Remember : hdr is no longer valid at this point. */
+ if (!pskb_may_pull(skb, off + optlen))
+ break;
+
while (1) {
struct ipv6_tlv_tnl_enc_lim *tel;
/* No more room for encapsulation limit */
- if (i + sizeof (*tel) > off + optlen)
+ if (i + sizeof(*tel) > optlen)
break;
- tel = (struct ipv6_tlv_tnl_enc_lim *) &raw[i];
+ tel = (struct ipv6_tlv_tnl_enc_lim *)(skb->data + off + i);
/* return index of option if found and valid */
if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT &&
tel->length == 1)
- return i;
+ return i + off - nhoff;
/* else jump to next option */
if (tel->type)
i += tel->length + 2;
@@ -443,7 +453,7 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
i++;
}
}
- nexthdr = hdr->nexthdr;
+ nexthdr = next;
off += optlen;
}
return 0;
@@ -1108,7 +1118,7 @@ route_lookup:
t->parms.name);
goto tx_err_dst_release;
}
- mtu = dst_mtu(dst) - psh_hlen;
+ mtu = dst_mtu(dst) - psh_hlen - t->tun_hlen;
if (encap_limit >= 0) {
max_headroom += 8;
mtu -= 8;
@@ -1117,7 +1127,7 @@ route_lookup:
mtu = IPV6_MIN_MTU;
if (skb_dst(skb) && !t->parms.collect_md)
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
- if (skb->len > mtu && !skb_is_gso(skb)) {
+ if (skb->len - t->tun_hlen > mtu && !skb_is_gso(skb)) {
*pmtu = mtu;
err = -EMSGSIZE;
goto tx_err_dst_release;
@@ -1166,7 +1176,7 @@ route_lookup:
if (encap_limit >= 0) {
init_tel_txopt(&opt, encap_limit);
- ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL);
+ ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL, NULL);
}
/* Calculate max headroom for all the headers and adjust
@@ -1248,6 +1258,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
fl6.flowi6_mark = skb->mark;
}
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1;
@@ -1301,6 +1313,8 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
fl6.flowlabel = key->label;
} else {
offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
+ /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
+ ipv6h = ipv6_hdr(skb);
if (offset > 0) {
struct ipv6_tlv_tnl_enc_lim *tel;
@@ -1326,6 +1340,8 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
fl6.flowi6_mark = skb->mark;
}
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1;
@@ -1645,7 +1661,7 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
struct ip6_tnl *tnl = netdev_priv(dev);
if (tnl->parms.proto == IPPROTO_IPIP) {
- if (new_mtu < 68)
+ if (new_mtu < ETH_MIN_MTU)
return -EINVAL;
} else {
if (new_mtu < IPV6_MIN_MTU)
@@ -1798,6 +1814,8 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
dev->mtu = ETH_DATA_LEN - t_hlen;
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
dev->mtu -= 8;
+ dev->min_mtu = ETH_MIN_MTU;
+ dev->max_mtu = 0xFFF8 - dev->hard_header_len;
return 0;
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index c299c1e2bbf0..3d8a3b63b4fd 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -49,6 +49,7 @@
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <linux/etherdevice.h>
#define IP6_VTI_HASH_SIZE_SHIFT 5
#define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT)
@@ -64,7 +65,7 @@ static int vti6_dev_init(struct net_device *dev);
static void vti6_dev_setup(struct net_device *dev);
static struct rtnl_link_ops vti6_link_ops __read_mostly;
-static int vti6_net_id __read_mostly;
+static unsigned int vti6_net_id __read_mostly;
struct vti6_net {
/* the vti6 tunnel fallback device */
struct net_device *fb_tnl_dev;
@@ -189,12 +190,12 @@ static int vti6_tnl_create2(struct net_device *dev)
struct vti6_net *ip6n = net_generic(net, vti6_net_id);
int err;
+ dev->rtnl_link_ops = &vti6_link_ops;
err = register_netdevice(dev);
if (err < 0)
goto out;
strcpy(t->parms.name, dev->name);
- dev->rtnl_link_ops = &vti6_link_ops;
dev_hold(dev);
vti6_tnl_link(ip6n, t);
@@ -484,11 +485,15 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
if (!skb->ignore_df && skb->len > mtu) {
skb_dst(skb)->ops->update_pmtu(dst, NULL, skb, mtu);
- if (skb->protocol == htons(ETH_P_IPV6))
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- else
+ } else {
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
+ }
return -EMSGSIZE;
}
@@ -608,9 +613,10 @@ static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
@@ -691,6 +697,10 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
u->link = p->link;
u->i_key = p->i_key;
u->o_key = p->o_key;
+ if (u->i_key)
+ u->i_flags |= GRE_KEY;
+ if (u->o_key)
+ u->o_flags |= GRE_KEY;
u->proto = p->proto;
memcpy(u->name, p->name, sizeof(u->name));
@@ -812,30 +822,11 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
return err;
}
-/**
- * vti6_tnl_change_mtu - change mtu manually for tunnel device
- * @dev: virtual device associated with tunnel
- * @new_mtu: the new mtu
- *
- * Return:
- * 0 on success,
- * %-EINVAL if mtu too small
- **/
-static int vti6_change_mtu(struct net_device *dev, int new_mtu)
-{
- if (new_mtu < IPV6_MIN_MTU)
- return -EINVAL;
-
- dev->mtu = new_mtu;
- return 0;
-}
-
static const struct net_device_ops vti6_netdev_ops = {
.ndo_init = vti6_dev_init,
.ndo_uninit = vti6_dev_uninit,
.ndo_start_xmit = vti6_tnl_xmit,
.ndo_do_ioctl = vti6_ioctl,
- .ndo_change_mtu = vti6_change_mtu,
.ndo_get_stats64 = ip_tunnel_get_stats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
@@ -855,9 +846,14 @@ static void vti6_dev_setup(struct net_device *dev)
dev->type = ARPHRD_TUNNEL6;
dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr);
dev->mtu = ETH_DATA_LEN;
+ dev->min_mtu = IPV6_MIN_MTU;
+ dev->max_mtu = IP_MAX_MTU;
dev->flags |= IFF_NOARP;
dev->addr_len = sizeof(struct in6_addr);
netif_keep_dst(dev);
+ /* This perm addr will be used as interface identifier by IPv6 */
+ dev->addr_assign_type = NET_ADDR_RANDOM;
+ eth_random_addr(dev->perm_addr);
}
/**
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 7f4265b1649b..bf34d0950752 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -16,7 +16,7 @@
*
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/errno.h>
@@ -636,7 +636,7 @@ static int pim6_rcv(struct sk_buff *skb)
goto drop;
pim = (struct pimreghdr *)skb_transport_header(skb);
- if (pim->type != ((PIM_VERSION << 4) | PIM_REGISTER) ||
+ if (pim->type != ((PIM_VERSION << 4) | PIM_TYPE_REGISTER) ||
(pim->flags & PIM_NULL_REGISTER) ||
(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
sizeof(*pim), IPPROTO_PIM,
@@ -774,7 +774,8 @@ failure:
* Delete a VIF entry
*/
-static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head)
+static int mif6_delete(struct mr6_table *mrt, int vifi, int notify,
+ struct list_head *head)
{
struct mif_device *v;
struct net_device *dev;
@@ -820,7 +821,7 @@ static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head)
dev->ifindex, &in6_dev->cnf);
}
- if (v->flags & MIFF_REGISTER)
+ if ((v->flags & MIFF_REGISTER) && !notify)
unregister_netdevice_queue(dev, head);
dev_put(dev);
@@ -1331,7 +1332,6 @@ static int ip6mr_device_event(struct notifier_block *this,
struct mr6_table *mrt;
struct mif_device *v;
int ct;
- LIST_HEAD(list);
if (event != NETDEV_UNREGISTER)
return NOTIFY_DONE;
@@ -1340,10 +1340,9 @@ static int ip6mr_device_event(struct notifier_block *this,
v = &mrt->vif6_table[0];
for (ct = 0; ct < mrt->maxvif; ct++, v++) {
if (v->dev == dev)
- mif6_delete(mrt, ct, &list);
+ mif6_delete(mrt, ct, 1, NULL);
}
}
- unregister_netdevice_many(&list);
return NOTIFY_DONE;
}
@@ -1552,7 +1551,7 @@ static void mroute_clean_tables(struct mr6_table *mrt, bool all)
for (i = 0; i < mrt->maxvif; i++) {
if (!all && (mrt->vif6_table[i].flags & VIFF_STATIC))
continue;
- mif6_delete(mrt, i, &list);
+ mif6_delete(mrt, i, 0, &list);
}
unregister_netdevice_many(&list);
@@ -1666,6 +1665,10 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
struct net *net = sock_net(sk);
struct mr6_table *mrt;
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+ return -EOPNOTSUPP;
+
mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
if (!mrt)
return -ENOENT;
@@ -1677,9 +1680,6 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
switch (optname) {
case MRT6_INIT:
- if (sk->sk_type != SOCK_RAW ||
- inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
- return -EOPNOTSUPP;
if (optlen < sizeof(int))
return -EINVAL;
@@ -1706,7 +1706,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
if (copy_from_user(&mifi, optval, sizeof(mifi_t)))
return -EFAULT;
rtnl_lock();
- ret = mif6_delete(mrt, mifi, NULL);
+ ret = mif6_delete(mrt, mifi, 0, NULL);
rtnl_unlock();
return ret;
@@ -1815,6 +1815,10 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
struct net *net = sock_net(sk);
struct mr6_table *mrt;
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+ return -EOPNOTSUPP;
+
mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
if (!mrt)
return -ENOENT;
@@ -2243,8 +2247,10 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
int ct;
/* If cache is unresolved, don't try to parse IIF and OIF */
- if (c->mf6c_parent >= MAXMIFS)
+ if (c->mf6c_parent >= MAXMIFS) {
+ rtm->rtm_flags |= RTNH_F_UNRESOLVED;
return -ENOENT;
+ }
if (MIF_EXISTS(mrt, c->mf6c_parent) &&
nla_put_u32(skb, RTA_IIF, mrt->vif6_table[c->mf6c_parent].dev->ifindex) < 0)
@@ -2286,7 +2292,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
}
int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
- int nowait, u32 portid)
+ u32 portid)
{
int err;
struct mr6_table *mrt;
@@ -2313,11 +2319,6 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
struct net_device *dev;
int vif;
- if (nowait) {
- read_unlock(&mrt_lock);
- return -EAGAIN;
- }
-
dev = skb->dev;
if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) {
read_unlock(&mrt_lock);
@@ -2355,7 +2356,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
return err;
}
- if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
+ if (rtm->rtm_flags & RTM_F_NOTIFY)
cache->mfc_flags |= MFC_NOTIFY;
err = __ip6mr_fill_mroute(mrt, skb, cache, rtm);
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 1b9316e1386a..54d165b9845a 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -74,9 +74,10 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 636ec56f5f50..a531ba032b85 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -52,8 +52,9 @@
#include <net/udplite.h>
#include <net/xfrm.h>
#include <net/compat.h>
+#include <net/seg6.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
struct ip6_ra_chain *ip6_ra_chain;
DEFINE_RWLOCK(ip6_ra_lock);
@@ -430,6 +431,15 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
break;
#endif
+ case IPV6_SRCRT_TYPE_4:
+ {
+ struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)
+ opt->srcrt;
+
+ if (!seg6_validate_srh(srh, optlen))
+ goto sticky_done;
+ break;
+ }
default:
goto sticky_done;
}
@@ -585,16 +595,24 @@ done:
if (val) {
struct net_device *dev;
+ int midx;
- if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val)
- goto e_inval;
+ rcu_read_lock();
- dev = dev_get_by_index(net, val);
+ dev = dev_get_by_index_rcu(net, val);
if (!dev) {
+ rcu_read_unlock();
retv = -ENODEV;
break;
}
- dev_put(dev);
+ midx = l3mdev_master_ifindex_rcu(dev);
+
+ rcu_read_unlock();
+
+ if (sk->sk_bound_dev_if &&
+ sk->sk_bound_dev_if != val &&
+ (!midx || midx != sk->sk_bound_dev_if))
+ goto e_inval;
}
np->mcast_oif = val;
retv = 0;
@@ -868,6 +886,10 @@ pref_skip_coa:
np->autoflowlabel = valbool;
retv = 0;
break;
+ case IPV6_RECVFRAGSIZE:
+ np->rxopt.bits.recvfragsize = valbool;
+ retv = 0;
+ break;
}
release_sock(sk);
@@ -1310,6 +1332,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
val = np->autoflowlabel;
break;
+ case IPV6_RECVFRAGSIZE:
+ val = np->rxopt.bits.recvfragsize;
+ break;
+
default:
return -ENOPROTOOPT;
}
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 14a3903f1c82..1bdc703cb966 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -81,7 +81,7 @@ static void mld_gq_timer_expire(unsigned long data);
static void mld_ifc_timer_expire(unsigned long data);
static void mld_ifc_event(struct inet6_dev *idev);
static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
-static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *addr);
+static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
static void mld_clear_delrec(struct inet6_dev *idev);
static bool mld_in_v1_mode(const struct inet6_dev *idev);
static int sf_setstate(struct ifmcaddr6 *pmc);
@@ -692,9 +692,9 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc)
dev_mc_del(dev, buf);
}
- if (mc->mca_flags & MAF_NOREPORT)
- goto done;
spin_unlock_bh(&mc->mca_lock);
+ if (mc->mca_flags & MAF_NOREPORT)
+ return;
if (!mc->idev->dead)
igmp6_leave_group(mc);
@@ -702,8 +702,6 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc)
spin_lock_bh(&mc->mca_lock);
if (del_timer(&mc->mca_timer))
atomic_dec(&mc->mca_refcnt);
-done:
- ip6_mc_clear_src(mc);
spin_unlock_bh(&mc->mca_lock);
}
@@ -748,10 +746,11 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
spin_unlock_bh(&idev->mc_lock);
}
-static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca)
+static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
{
struct ifmcaddr6 *pmc, *pmc_prev;
- struct ip6_sf_list *psf, *psf_next;
+ struct ip6_sf_list *psf;
+ struct in6_addr *pmca = &im->mca_addr;
spin_lock_bh(&idev->mc_lock);
pmc_prev = NULL;
@@ -768,14 +767,21 @@ static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca)
}
spin_unlock_bh(&idev->mc_lock);
+ spin_lock_bh(&im->mca_lock);
if (pmc) {
- for (psf = pmc->mca_tomb; psf; psf = psf_next) {
- psf_next = psf->sf_next;
- kfree(psf);
+ im->idev = pmc->idev;
+ im->mca_crcount = idev->mc_qrv;
+ im->mca_sfmode = pmc->mca_sfmode;
+ if (pmc->mca_sfmode == MCAST_INCLUDE) {
+ im->mca_tomb = pmc->mca_tomb;
+ im->mca_sources = pmc->mca_sources;
+ for (psf = im->mca_sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = im->mca_crcount;
}
in6_dev_put(pmc->idev);
kfree(pmc);
}
+ spin_unlock_bh(&im->mca_lock);
}
static void mld_clear_delrec(struct inet6_dev *idev)
@@ -904,7 +910,7 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
mca_get(mc);
write_unlock_bh(&idev->lock);
- mld_del_delrec(idev, &mc->mca_addr);
+ mld_del_delrec(idev, mc);
igmp6_group_added(mc);
ma_put(mc);
return 0;
@@ -927,6 +933,7 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr)
write_unlock_bh(&idev->lock);
igmp6_group_dropped(ma);
+ ip6_mc_clear_src(ma);
ma_put(ma);
return 0;
@@ -2501,15 +2508,17 @@ void ipv6_mc_down(struct inet6_dev *idev)
/* Withdraw multicast list */
read_lock_bh(&idev->lock);
- mld_ifc_stop_timer(idev);
- mld_gq_stop_timer(idev);
- mld_dad_stop_timer(idev);
for (i = idev->mc_list; i; i = i->next)
igmp6_group_dropped(i);
- read_unlock_bh(&idev->lock);
- mld_clear_delrec(idev);
+ /* Should stop timer after group drop. or we will
+ * start timer again in mld_ifc_event()
+ */
+ mld_ifc_stop_timer(idev);
+ mld_gq_stop_timer(idev);
+ mld_dad_stop_timer(idev);
+ read_unlock_bh(&idev->lock);
}
static void ipv6_mc_reset(struct inet6_dev *idev)
@@ -2531,8 +2540,10 @@ void ipv6_mc_up(struct inet6_dev *idev)
read_lock_bh(&idev->lock);
ipv6_mc_reset(idev);
- for (i = idev->mc_list; i; i = i->next)
+ for (i = idev->mc_list; i; i = i->next) {
+ mld_del_delrec(idev, i);
igmp6_group_added(i);
+ }
read_unlock_bh(&idev->lock);
}
@@ -2565,6 +2576,7 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev)
/* Deactivate timers */
ipv6_mc_down(idev);
+ mld_clear_delrec(idev);
/* Delete all-nodes address. */
/* We cannot call ipv6_dev_mc_dec() directly, our caller in
@@ -2579,11 +2591,9 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev)
write_lock_bh(&idev->lock);
while ((i = idev->mc_list) != NULL) {
idev->mc_list = i->next;
- write_unlock_bh(&idev->lock);
- igmp6_group_dropped(i);
+ write_unlock_bh(&idev->lock);
ma_put(i);
-
write_lock_bh(&idev->lock);
}
write_unlock_bh(&idev->lock);
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
index 60c79a08e14a..64f0f7be9e5e 100644
--- a/net/ipv6/mip6.c
+++ b/net/ipv6/mip6.c
@@ -191,7 +191,7 @@ static inline int mip6_report_rl_allow(ktime_t stamp,
int allow = 0;
spin_lock_bh(&mip6_report_rl.lock);
- if (!ktime_equal(mip6_report_rl.stamp, stamp) ||
+ if (mip6_report_rl.stamp != stamp ||
mip6_report_rl.iif != iif ||
!ipv6_addr_equal(&mip6_report_rl.src, src) ||
!ipv6_addr_equal(&mip6_report_rl.dst, dst)) {
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index d8e671457d10..7ebac630d3c6 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -233,6 +233,7 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
case ND_OPT_SOURCE_LL_ADDR:
case ND_OPT_TARGET_LL_ADDR:
case ND_OPT_MTU:
+ case ND_OPT_NONCE:
case ND_OPT_REDIRECT_HDR:
if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
ND_PRINTK(2, warn,
@@ -568,7 +569,8 @@ static void ndisc_send_unsol_na(struct net_device *dev)
}
void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
- const struct in6_addr *daddr, const struct in6_addr *saddr)
+ const struct in6_addr *daddr, const struct in6_addr *saddr,
+ u64 nonce)
{
struct sk_buff *skb;
struct in6_addr addr_buf;
@@ -588,6 +590,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
if (inc_opt)
optlen += ndisc_opt_addr_space(dev,
NDISC_NEIGHBOUR_SOLICITATION);
+ if (nonce != 0)
+ optlen += 8;
skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
if (!skb)
@@ -605,6 +609,13 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
dev->dev_addr,
NDISC_NEIGHBOUR_SOLICITATION);
+ if (nonce != 0) {
+ u8 *opt = skb_put(skb, 8);
+
+ opt[0] = ND_OPT_NONCE;
+ opt[1] = 8 >> 3;
+ memcpy(opt + 2, &nonce, 6);
+ }
ndisc_send_skb(skb, daddr, saddr);
}
@@ -693,12 +704,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
"%s: trying to ucast probe in NUD_INVALID: %pI6\n",
__func__, target);
}
- ndisc_send_ns(dev, target, target, saddr);
+ ndisc_send_ns(dev, target, target, saddr, 0);
} else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) {
neigh_app_ns(neigh);
} else {
addrconf_addr_solict_mult(target, &mcaddr);
- ndisc_send_ns(dev, target, &mcaddr, saddr);
+ ndisc_send_ns(dev, target, &mcaddr, saddr, 0);
}
}
@@ -742,6 +753,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
int dad = ipv6_addr_any(saddr);
bool inc;
int is_router = -1;
+ u64 nonce = 0;
if (skb->len < sizeof(struct nd_msg)) {
ND_PRINTK(2, warn, "NS: packet too short\n");
@@ -786,6 +798,8 @@ static void ndisc_recv_ns(struct sk_buff *skb)
return;
}
}
+ if (ndopts.nd_opts_nonce)
+ memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6);
inc = ipv6_addr_is_multicast(daddr);
@@ -794,6 +808,15 @@ static void ndisc_recv_ns(struct sk_buff *skb)
have_ifp:
if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) {
if (dad) {
+ if (nonce != 0 && ifp->dad_nonce == nonce) {
+ u8 *np = (u8 *)&nonce;
+ /* Matching nonce if looped back */
+ ND_PRINTK(2, notice,
+ "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n",
+ ifp->idev->dev->name,
+ &ifp->addr, np);
+ goto out;
+ }
/*
* We are colliding with another node
* who is doing DAD
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index d11c46833d61..39970e212ad5 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -26,6 +26,7 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
struct flowi6 fl6 = {
.flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
.flowi6_mark = skb->mark,
+ .flowi6_uid = sock_net_uid(net, skb->sk),
.daddr = iph->daddr,
.saddr = iph->saddr,
};
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index e10a04c9cdc7..6acb2eecd986 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -25,6 +25,12 @@ config NF_CONNTRACK_IPV6
To compile it as a module, choose M here. If unsure, say N.
+config NF_SOCKET_IPV6
+ tristate "IPv6 socket lookup support"
+ help
+ This option enables the IPv6 socket lookup infrastructure. This
+ is used by the ip6tables socket match.
+
if NF_TABLES
config NF_TABLES_IPV6
@@ -54,6 +60,14 @@ config NFT_DUP_IPV6
help
This module enables IPv6 packet duplication support for nf_tables.
+config NFT_FIB_IPV6
+ tristate "nf_tables fib / ipv6 route lookup support"
+ select NFT_FIB
+ help
+ This module enables IPv6 FIB lookups, e.g. for reverse path filtering.
+ It also allows query of the FIB for the route type, e.g. local, unicast,
+ multicast or blackhole.
+
endif # NF_TABLES_IPV6
endif # NF_TABLES
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index b4f7d0b4e2af..fe180c96040e 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -24,6 +24,8 @@ obj-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
+obj-$(CONFIG_NF_SOCKET_IPV6) += nf_socket_ipv6.o
+
# logging
obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o
@@ -40,6 +42,7 @@ obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o
obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o
obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o
obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o
+obj-$(CONFIG_NFT_FIB_IPV6) += nft_fib_ipv6.o
# matches
obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 55aacea24396..1e15c54fd5e2 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -24,7 +24,7 @@
#include <linux/icmpv6.h>
#include <net/ipv6.h>
#include <net/compat.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/mutex.h>
#include <linux/proc_fs.h>
#include <linux/err.h>
@@ -291,11 +291,7 @@ ip6t_do_table(struct sk_buff *skb,
* rule is also a fragment-specific rule, non-fragments won't
* match it. */
acpar.hotdrop = false;
- acpar.net = state->net;
- acpar.in = state->in;
- acpar.out = state->out;
- acpar.family = NFPROTO_IPV6;
- acpar.hooknum = hook;
+ acpar.state = state;
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
@@ -566,7 +562,8 @@ static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
static int
find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
- unsigned int size)
+ unsigned int size,
+ struct xt_percpu_counter_alloc_state *alloc_state)
{
struct xt_entry_target *t;
struct xt_target *target;
@@ -574,12 +571,9 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
unsigned int j;
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
- unsigned long pcnt;
- pcnt = xt_percpu_counter_alloc();
- if (IS_ERR_VALUE(pcnt))
+ if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
return -ENOMEM;
- e->counters.pcnt = pcnt;
j = 0;
mtpar.net = net;
@@ -616,7 +610,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
cleanup_match(ematch, net);
}
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
return ret;
}
@@ -703,8 +697,7 @@ static void cleanup_entry(struct ip6t_entry *e, struct net *net)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
-
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
}
/* Checks and translates the user-supplied table segment (held in
@@ -713,6 +706,7 @@ static int
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
const struct ip6t_replace *repl)
{
+ struct xt_percpu_counter_alloc_state alloc_state = { 0 };
struct ip6t_entry *iter;
unsigned int *offsets;
unsigned int i;
@@ -772,7 +766,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
/* Finally, each sanity check must pass */
i = 0;
xt_entry_foreach(iter, entry0, newinfo->size) {
- ret = find_check_entry(iter, net, repl->name, repl->size);
+ ret = find_check_entry(iter, net, repl->name, repl->size,
+ &alloc_state);
if (ret != 0)
break;
++i;
@@ -860,10 +855,6 @@ copy_entries_to_user(unsigned int total_size,
return PTR_ERR(counters);
loc_cpu_entry = private->entries;
- if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
- ret = -EFAULT;
- goto free_counters;
- }
/* FIXME: use iterator macros --RR */
/* ... then go back and fix counters and names */
@@ -873,6 +864,10 @@ copy_entries_to_user(unsigned int total_size,
const struct xt_entry_target *t;
e = (struct ip6t_entry *)(loc_cpu_entry + off);
+ if (copy_to_user(userptr + off, e, sizeof(*e))) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
if (copy_to_user(userptr + off
+ offsetof(struct ip6t_entry, counters),
&counters[num],
@@ -886,23 +881,14 @@ copy_entries_to_user(unsigned int total_size,
i += m->u.match_size) {
m = (void *)e + i;
- if (copy_to_user(userptr + off + i
- + offsetof(struct xt_entry_match,
- u.user.name),
- m->u.kernel.match->name,
- strlen(m->u.kernel.match->name)+1)
- != 0) {
+ if (xt_match_to_user(m, userptr + off + i)) {
ret = -EFAULT;
goto free_counters;
}
}
t = ip6t_get_target_c(e);
- if (copy_to_user(userptr + off + e->target_offset
- + offsetof(struct xt_entry_target,
- u.user.name),
- t->u.kernel.target->name,
- strlen(t->u.kernel.target->name)+1) != 0) {
+ if (xt_target_to_user(t, userptr + off + e->target_offset)) {
ret = -EFAULT;
goto free_counters;
}
@@ -1007,7 +993,7 @@ static int get_info(struct net *net, void __user *user,
#endif
t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name),
"ip6table_%s", name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
struct ip6t_getinfo info;
const struct xt_table_info *private = t->private;
#ifdef CONFIG_COMPAT
@@ -1037,7 +1023,7 @@ static int get_info(struct net *net, void __user *user,
xt_table_unlock(t);
module_put(t->me);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
#ifdef CONFIG_COMPAT
if (compat)
xt_compat_unlock(AF_INET6);
@@ -1063,7 +1049,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, AF_INET6, get.name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
struct xt_table_info *private = t->private;
if (get.size == private->size)
ret = copy_entries_to_user(private->size,
@@ -1074,7 +1060,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
return ret;
}
@@ -1099,8 +1085,8 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name),
"ip6table_%s", name);
- if (IS_ERR_OR_NULL(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ if (!t) {
+ ret = -ENOENT;
goto free_newinfo_counters_untrans;
}
@@ -1214,8 +1200,8 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
if (IS_ERR(paddc))
return PTR_ERR(paddc);
t = xt_find_table_lock(net, AF_INET6, tmp.name);
- if (IS_ERR_OR_NULL(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ if (!t) {
+ ret = -ENOENT;
goto free;
}
@@ -1651,7 +1637,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
xt_compat_lock(AF_INET6);
t = xt_find_table_lock(net, AF_INET6, get.name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
ret = compat_table_info(private, &info);
@@ -1665,7 +1651,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
xt_compat_unlock(AF_INET6);
return ret;
diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c
index 7f9f45d829d2..2b1a15846f9a 100644
--- a/net/ipv6/netfilter/ip6t_MASQUERADE.c
+++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c
@@ -24,7 +24,7 @@
static unsigned int
masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
- return nf_nat_masquerade_ipv6(skb, par->targinfo, par->out);
+ return nf_nat_masquerade_ipv6(skb, par->targinfo, xt_out(par));
}
static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c
index 590f767db5d4..a379d2f79b19 100644
--- a/net/ipv6/netfilter/ip6t_NPT.c
+++ b/net/ipv6/netfilter/ip6t_NPT.c
@@ -112,6 +112,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = {
.table = "mangle",
.target = ip6t_snpt_tg,
.targetsize = sizeof(struct ip6t_npt_tginfo),
+ .usersize = offsetof(struct ip6t_npt_tginfo, adjustment),
.checkentry = ip6t_npt_checkentry,
.family = NFPROTO_IPV6,
.hooks = (1 << NF_INET_LOCAL_IN) |
@@ -123,6 +124,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = {
.table = "mangle",
.target = ip6t_dnpt_tg,
.targetsize = sizeof(struct ip6t_npt_tginfo),
+ .usersize = offsetof(struct ip6t_npt_tginfo, adjustment),
.checkentry = ip6t_npt_checkentry,
.family = NFPROTO_IPV6,
.hooks = (1 << NF_INET_PRE_ROUTING) |
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index db29bbf41b59..fa51a205918d 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -39,35 +39,40 @@ static unsigned int
reject_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ip6t_reject_info *reject = par->targinfo;
- struct net *net = par->net;
+ struct net *net = xt_net(par);
switch (reject->with) {
case IP6T_ICMP6_NO_ROUTE:
- nf_send_unreach6(net, skb, ICMPV6_NOROUTE, par->hooknum);
+ nf_send_unreach6(net, skb, ICMPV6_NOROUTE, xt_hooknum(par));
break;
case IP6T_ICMP6_ADM_PROHIBITED:
- nf_send_unreach6(net, skb, ICMPV6_ADM_PROHIBITED, par->hooknum);
+ nf_send_unreach6(net, skb, ICMPV6_ADM_PROHIBITED,
+ xt_hooknum(par));
break;
case IP6T_ICMP6_NOT_NEIGHBOUR:
- nf_send_unreach6(net, skb, ICMPV6_NOT_NEIGHBOUR, par->hooknum);
+ nf_send_unreach6(net, skb, ICMPV6_NOT_NEIGHBOUR,
+ xt_hooknum(par));
break;
case IP6T_ICMP6_ADDR_UNREACH:
- nf_send_unreach6(net, skb, ICMPV6_ADDR_UNREACH, par->hooknum);
+ nf_send_unreach6(net, skb, ICMPV6_ADDR_UNREACH,
+ xt_hooknum(par));
break;
case IP6T_ICMP6_PORT_UNREACH:
- nf_send_unreach6(net, skb, ICMPV6_PORT_UNREACH, par->hooknum);
+ nf_send_unreach6(net, skb, ICMPV6_PORT_UNREACH,
+ xt_hooknum(par));
break;
case IP6T_ICMP6_ECHOREPLY:
/* Do nothing */
break;
case IP6T_TCP_RESET:
- nf_send_reset6(net, skb, par->hooknum);
+ nf_send_reset6(net, skb, xt_hooknum(par));
break;
case IP6T_ICMP6_POLICY_FAIL:
- nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, par->hooknum);
+ nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, xt_hooknum(par));
break;
case IP6T_ICMP6_REJECT_ROUTE:
- nf_send_unreach6(net, skb, ICMPV6_REJECT_ROUTE, par->hooknum);
+ nf_send_unreach6(net, skb, ICMPV6_REJECT_ROUTE,
+ xt_hooknum(par));
break;
}
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 06bed74cf5ee..4ef1ddd4bbbd 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -71,8 +71,7 @@ synproxy_send_tcp(struct net *net,
skb_dst_set(nskb, dst);
if (nfct) {
- nskb->nfct = nfct;
- nskb->nfctinfo = ctinfo;
+ nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
nf_conntrack_get(nfct);
}
@@ -121,8 +120,8 @@ synproxy_send_client_synack(struct net *net,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
- niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
}
static void
@@ -244,8 +243,8 @@ synproxy_send_client_ack(struct net *net,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
- niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
}
static bool
@@ -277,12 +276,12 @@ static unsigned int
synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_synproxy_info *info = par->targinfo;
- struct net *net = par->net;
+ struct net *net = xt_net(par);
struct synproxy_net *snet = synproxy_pernet(net);
struct synproxy_options opts = {};
struct tcphdr *th, _th;
- if (nf_ip6_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP))
+ if (nf_ip6_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP))
return NF_DROP;
th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
@@ -440,12 +439,12 @@ static int synproxy_tg6_check(const struct xt_tgchk_param *par)
e->ipv6.invflags & XT_INV_PROTO)
return -EINVAL;
- return nf_ct_l3proto_try_module_get(par->family);
+ return nf_ct_netns_get(par->net, par->family);
}
static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_target synproxy_tg6_reg __read_mostly = {
diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
index 1ee1b25df096..b12e61b7b16c 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -72,10 +72,10 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
return ret;
}
-static bool rpfilter_is_local(const struct sk_buff *skb)
+static bool
+rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in)
{
- const struct rt6_info *rt = (const void *) skb_dst(skb);
- return rt && (rt->rt6i_flags & RTF_LOCAL);
+ return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK;
}
static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
@@ -85,7 +85,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
struct ipv6hdr *iph;
bool invert = info->flags & XT_RPFILTER_INVERT;
- if (rpfilter_is_local(skb))
+ if (rpfilter_is_loopback(skb, xt_in(par)))
return true ^ invert;
iph = ipv6_hdr(skb);
@@ -93,7 +93,8 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (unlikely(saddrtype == IPV6_ADDR_ANY))
return true ^ invert; /* not routable: forward path will drop it */
- return rpfilter_lookup_reverse6(par->net, skb, par->in, info->flags) ^ invert;
+ return rpfilter_lookup_reverse6(xt_net(par), skb, xt_in(par),
+ info->flags) ^ invert;
}
static int rpfilter_check(const struct xt_mtchk_param *par)
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 963ee3848675..4e3402486833 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -34,6 +34,13 @@
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/netfilter/nf_log.h>
+static int conntrack6_net_id;
+static DEFINE_MUTEX(register_ipv6_hooks);
+
+struct conntrack6_net {
+ unsigned int users;
+};
+
static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
struct nf_conntrack_tuple *tuple)
{
@@ -308,6 +315,42 @@ static int ipv6_nlattr_tuple_size(void)
}
#endif
+static int ipv6_hooks_register(struct net *net)
+{
+ struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id);
+ int err = 0;
+
+ mutex_lock(&register_ipv6_hooks);
+ cnet->users++;
+ if (cnet->users > 1)
+ goto out_unlock;
+
+ err = nf_defrag_ipv6_enable(net);
+ if (err < 0) {
+ cnet->users = 0;
+ goto out_unlock;
+ }
+
+ err = nf_register_net_hooks(net, ipv6_conntrack_ops,
+ ARRAY_SIZE(ipv6_conntrack_ops));
+ if (err)
+ cnet->users = 0;
+ out_unlock:
+ mutex_unlock(&register_ipv6_hooks);
+ return err;
+}
+
+static void ipv6_hooks_unregister(struct net *net)
+{
+ struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id);
+
+ mutex_lock(&register_ipv6_hooks);
+ if (cnet->users && (--cnet->users == 0))
+ nf_unregister_net_hooks(net, ipv6_conntrack_ops,
+ ARRAY_SIZE(ipv6_conntrack_ops));
+ mutex_unlock(&register_ipv6_hooks);
+}
+
struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = {
.l3proto = PF_INET6,
.name = "ipv6",
@@ -321,6 +364,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = {
.nlattr_to_tuple = ipv6_nlattr_to_tuple,
.nla_policy = ipv6_nla_policy,
#endif
+ .net_ns_get = ipv6_hooks_register,
+ .net_ns_put = ipv6_hooks_unregister,
.me = THIS_MODULE,
};
@@ -336,52 +381,51 @@ static struct nf_sockopt_ops so_getorigdst6 = {
.owner = THIS_MODULE,
};
+static struct nf_conntrack_l4proto *builtin_l4proto6[] = {
+ &nf_conntrack_l4proto_tcp6,
+ &nf_conntrack_l4proto_udp6,
+ &nf_conntrack_l4proto_icmpv6,
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+ &nf_conntrack_l4proto_dccp6,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ &nf_conntrack_l4proto_sctp6,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+ &nf_conntrack_l4proto_udplite6,
+#endif
+};
+
static int ipv6_net_init(struct net *net)
{
int ret = 0;
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp6);
- if (ret < 0) {
- pr_err("nf_conntrack_tcp6: pernet registration failed\n");
- goto out;
- }
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp6);
- if (ret < 0) {
- pr_err("nf_conntrack_udp6: pernet registration failed\n");
- goto cleanup_tcp6;
- }
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmpv6);
- if (ret < 0) {
- pr_err("nf_conntrack_icmp6: pernet registration failed\n");
- goto cleanup_udp6;
- }
+ ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto6,
+ ARRAY_SIZE(builtin_l4proto6));
+ if (ret < 0)
+ return ret;
+
ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv6);
if (ret < 0) {
pr_err("nf_conntrack_ipv6: pernet registration failed.\n");
- goto cleanup_icmpv6;
+ nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6,
+ ARRAY_SIZE(builtin_l4proto6));
}
- return 0;
- cleanup_icmpv6:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmpv6);
- cleanup_udp6:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp6);
- cleanup_tcp6:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp6);
- out:
return ret;
}
static void ipv6_net_exit(struct net *net)
{
nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv6);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmpv6);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp6);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp6);
+ nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6,
+ ARRAY_SIZE(builtin_l4proto6));
}
static struct pernet_operations ipv6_net_ops = {
.init = ipv6_net_init,
.exit = ipv6_net_exit,
+ .id = &conntrack6_net_id,
+ .size = sizeof(struct conntrack6_net),
};
static int __init nf_conntrack_l3proto_ipv6_init(void)
@@ -389,7 +433,6 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
int ret = 0;
need_conntrack();
- nf_defrag_ipv6_enable();
ret = nf_register_sockopt(&so_getorigdst6);
if (ret < 0) {
@@ -401,47 +444,20 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
if (ret < 0)
goto cleanup_sockopt;
- ret = nf_register_hooks(ipv6_conntrack_ops,
- ARRAY_SIZE(ipv6_conntrack_ops));
- if (ret < 0) {
- pr_err("nf_conntrack_ipv6: can't register pre-routing defrag "
- "hook.\n");
+ ret = nf_ct_l4proto_register(builtin_l4proto6,
+ ARRAY_SIZE(builtin_l4proto6));
+ if (ret < 0)
goto cleanup_pernet;
- }
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp6);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv6: can't register tcp6 proto.\n");
- goto cleanup_hooks;
- }
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp6);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv6: can't register udp6 proto.\n");
- goto cleanup_tcp6;
- }
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmpv6);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv6: can't register icmpv6 proto.\n");
- goto cleanup_udp6;
- }
ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv6);
if (ret < 0) {
pr_err("nf_conntrack_ipv6: can't register ipv6 proto.\n");
- goto cleanup_icmpv6;
+ goto cleanup_l4proto;
}
return ret;
-
- cleanup_icmpv6:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
- cleanup_udp6:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp6);
- cleanup_tcp6:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
- cleanup_hooks:
- nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops));
+cleanup_l4proto:
+ nf_ct_l4proto_unregister(builtin_l4proto6,
+ ARRAY_SIZE(builtin_l4proto6));
cleanup_pernet:
unregister_pernet_subsys(&ipv6_net_ops);
cleanup_sockopt:
@@ -453,10 +469,8 @@ static void __exit nf_conntrack_l3proto_ipv6_fini(void)
{
synchronize_net();
nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp6);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
- nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops));
+ nf_ct_l4proto_unregister(builtin_l4proto6,
+ ARRAY_SIZE(builtin_l4proto6));
unregister_pernet_subsys(&ipv6_net_ops);
nf_unregister_sockopt(&so_getorigdst6);
}
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index f5a61bc3ec2b..d2c2ccbfbe72 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -145,15 +145,15 @@ static int
icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
struct sk_buff *skb,
unsigned int icmp6off,
- enum ip_conntrack_info *ctinfo,
unsigned int hooknum)
{
struct nf_conntrack_tuple intuple, origtuple;
const struct nf_conntrack_tuple_hash *h;
const struct nf_conntrack_l4proto *inproto;
+ enum ip_conntrack_info ctinfo;
struct nf_conntrack_zone tmp;
- NF_CT_ASSERT(skb->nfct == NULL);
+ NF_CT_ASSERT(!skb_nfct(skb));
/* Are they talking about one of our connections? */
if (!nf_ct_get_tuplepr(skb,
@@ -176,7 +176,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
return -NF_ACCEPT;
}
- *ctinfo = IP_CT_RELATED;
+ ctinfo = IP_CT_RELATED;
h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp),
&intuple);
@@ -185,19 +185,18 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
return -NF_ACCEPT;
} else {
if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
- *ctinfo += IP_CT_IS_REPLY;
+ ctinfo += IP_CT_IS_REPLY;
}
/* Update skb to refer to this connection */
- skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
- skb->nfctinfo = *ctinfo;
+ nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
return NF_ACCEPT;
}
static int
icmpv6_error(struct net *net, struct nf_conn *tmpl,
struct sk_buff *skb, unsigned int dataoff,
- enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
+ u8 pf, unsigned int hooknum)
{
const struct icmp6hdr *icmp6h;
struct icmp6hdr _ih;
@@ -222,9 +221,8 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
type = icmp6h->icmp6_type - 130;
if (type >= 0 && type < sizeof(noct_valid_new) &&
noct_valid_new[type]) {
- skb->nfct = &nf_ct_untracked_get()->ct_general;
- skb->nfctinfo = IP_CT_NEW;
- nf_conntrack_get(skb->nfct);
+ nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
+ nf_conntrack_get(skb_nfct(skb));
return NF_ACCEPT;
}
@@ -232,7 +230,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
if (icmp6h->icmp6_type >= 128)
return NF_ACCEPT;
- return icmpv6_error_message(net, tmpl, skb, dataoff, ctinfo, hooknum);
+ return icmpv6_error_message(net, tmpl, skb, dataoff, hooknum);
}
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 9948b5ce52da..986d4ca38832 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -589,6 +589,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
hdr = ipv6_hdr(skb);
fhdr = (struct frag_hdr *)skb_transport_header(skb);
+ skb_orphan(skb);
fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
if (fq == NULL) {
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index f06b0471f39f..ada60d1a991b 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -30,12 +30,14 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+static DEFINE_MUTEX(defrag6_mutex);
+
static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
struct sk_buff *skb)
{
u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- if (skb->nfct) {
+ if (skb_nfct(skb)) {
enum ip_conntrack_info ctinfo;
const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
@@ -59,7 +61,7 @@ static unsigned int ipv6_defrag(void *priv,
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* Previously seen (loopback)? */
- if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+ if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
return NF_ACCEPT;
#endif
@@ -87,6 +89,19 @@ static struct nf_hook_ops ipv6_defrag_ops[] = {
},
};
+static void __net_exit defrag6_net_exit(struct net *net)
+{
+ if (net->nf.defrag_ipv6) {
+ nf_unregister_net_hooks(net, ipv6_defrag_ops,
+ ARRAY_SIZE(ipv6_defrag_ops));
+ net->nf.defrag_ipv6 = false;
+ }
+}
+
+static struct pernet_operations defrag6_net_ops = {
+ .exit = defrag6_net_exit,
+};
+
static int __init nf_defrag_init(void)
{
int ret = 0;
@@ -96,9 +111,9 @@ static int __init nf_defrag_init(void)
pr_err("nf_defrag_ipv6: can't initialize frag6.\n");
return ret;
}
- ret = nf_register_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
+ ret = register_pernet_subsys(&defrag6_net_ops);
if (ret < 0) {
- pr_err("nf_defrag_ipv6: can't register hooks\n");
+ pr_err("nf_defrag_ipv6: can't register pernet ops\n");
goto cleanup_frag6;
}
return ret;
@@ -111,12 +126,31 @@ cleanup_frag6:
static void __exit nf_defrag_fini(void)
{
- nf_unregister_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
+ unregister_pernet_subsys(&defrag6_net_ops);
nf_ct_frag6_cleanup();
}
-void nf_defrag_ipv6_enable(void)
+int nf_defrag_ipv6_enable(struct net *net)
{
+ int err = 0;
+
+ might_sleep();
+
+ if (net->nf.defrag_ipv6)
+ return 0;
+
+ mutex_lock(&defrag6_mutex);
+ if (net->nf.defrag_ipv6)
+ goto out_unlock;
+
+ err = nf_register_net_hooks(net, ipv6_defrag_ops,
+ ARRAY_SIZE(ipv6_defrag_ops));
+ if (err == 0)
+ net->nf.defrag_ipv6 = true;
+
+ out_unlock:
+ mutex_unlock(&defrag6_mutex);
+ return err;
}
EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable);
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index 4a84b5ad9ecb..888ecd106e5f 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -57,10 +57,9 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
return;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- nf_conntrack_put(skb->nfct);
- skb->nfct = &nf_ct_untracked_get()->ct_general;
- skb->nfctinfo = IP_CT_NEW;
- nf_conntrack_get(skb->nfct);
+ nf_reset(skb);
+ nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
+ nf_conntrack_get(skb_nfct(skb));
#endif
if (hooknum == NF_INET_PRE_ROUTING ||
hooknum == NF_INET_LOCAL_IN) {
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index 57d86066a13b..97c724224da7 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -64,7 +64,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
/* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
- nf_log_buf_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
+ nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
(ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
ih->hop_limit,
@@ -351,7 +351,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
struct nf_log_buf *m;
/* FIXME: Disabled from containers until syslog ns is supported */
- if (!net_eq(net, &init_net))
+ if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
return;
m = nf_log_buf_open();
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index 10090400c72f..eedee5d108d9 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -157,6 +157,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
fl6.fl6_sport = otcph->dest;
fl6.fl6_dport = otcph->source;
fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev);
+ fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark);
security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
@@ -180,6 +181,8 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
skb_dst_set(nskb, dst);
+ nskb->mark = fl6.flowi6_mark;
+
skb_reserve(nskb, hh_len + dst->header_len);
ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
ip6_dst_hoplimit(dst));
diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c
new file mode 100644
index 000000000000..ebb2bf84232a
--- /dev/null
+++ b/net/ipv6/netfilter/nf_socket_ipv6.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright (C) 2007-2008 BalaBit IT Ltd.
+ * Author: Krisztian Kovacs
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <net/inet6_hashtables.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/netfilter/nf_socket.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+static int
+extract_icmp6_fields(const struct sk_buff *skb,
+ unsigned int outside_hdrlen,
+ int *protocol,
+ const struct in6_addr **raddr,
+ const struct in6_addr **laddr,
+ __be16 *rport,
+ __be16 *lport,
+ struct ipv6hdr *ipv6_var)
+{
+ const struct ipv6hdr *inside_iph;
+ struct icmp6hdr *icmph, _icmph;
+ __be16 *ports, _ports[2];
+ u8 inside_nexthdr;
+ __be16 inside_fragoff;
+ int inside_hdrlen;
+
+ icmph = skb_header_pointer(skb, outside_hdrlen,
+ sizeof(_icmph), &_icmph);
+ if (icmph == NULL)
+ return 1;
+
+ if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK)
+ return 1;
+
+ inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph),
+ sizeof(*ipv6_var), ipv6_var);
+ if (inside_iph == NULL)
+ return 1;
+ inside_nexthdr = inside_iph->nexthdr;
+
+ inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) +
+ sizeof(*ipv6_var),
+ &inside_nexthdr, &inside_fragoff);
+ if (inside_hdrlen < 0)
+ return 1; /* hjm: Packet has no/incomplete transport layer headers. */
+
+ if (inside_nexthdr != IPPROTO_TCP &&
+ inside_nexthdr != IPPROTO_UDP)
+ return 1;
+
+ ports = skb_header_pointer(skb, inside_hdrlen,
+ sizeof(_ports), &_ports);
+ if (ports == NULL)
+ return 1;
+
+ /* the inside IP packet is the one quoted from our side, thus
+ * its saddr is the local address */
+ *protocol = inside_nexthdr;
+ *laddr = &inside_iph->saddr;
+ *lport = ports[0];
+ *raddr = &inside_iph->daddr;
+ *rport = ports[1];
+
+ return 0;
+}
+
+static struct sock *
+nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
+ const u8 protocol,
+ const struct in6_addr *saddr, const struct in6_addr *daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in)
+{
+ switch (protocol) {
+ case IPPROTO_TCP:
+ return inet6_lookup(net, &tcp_hashinfo, skb, doff,
+ saddr, sport, daddr, dport,
+ in->ifindex);
+ case IPPROTO_UDP:
+ return udp6_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ }
+
+ return NULL;
+}
+
+struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
+ const struct net_device *indev)
+{
+ __be16 uninitialized_var(dport), uninitialized_var(sport);
+ const struct in6_addr *daddr = NULL, *saddr = NULL;
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct sk_buff *data_skb = NULL;
+ int doff = 0;
+ int thoff = 0, tproto;
+
+ tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
+ if (tproto < 0) {
+ pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+ return NULL;
+ }
+
+ if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) {
+ struct udphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return NULL;
+
+ saddr = &iph->saddr;
+ sport = hp->source;
+ daddr = &iph->daddr;
+ dport = hp->dest;
+ data_skb = (struct sk_buff *)skb;
+ doff = tproto == IPPROTO_TCP ?
+ thoff + __tcp_hdrlen((struct tcphdr *)hp) :
+ thoff + sizeof(*hp);
+
+ } else if (tproto == IPPROTO_ICMPV6) {
+ struct ipv6hdr ipv6_var;
+
+ if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
+ &sport, &dport, &ipv6_var))
+ return NULL;
+ } else {
+ return NULL;
+ }
+
+ return nf_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr,
+ sport, dport, indev);
+}
+EXPORT_SYMBOL_GPL(nf_sk_lookup_slow_v6);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
+MODULE_DESCRIPTION("Netfilter IPv6 socket lookup infrastructure");
diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c
index 831f86e1ec08..d8b5b60b7d53 100644
--- a/net/ipv6/netfilter/nft_dup_ipv6.c
+++ b/net/ipv6/netfilter/nft_dup_ipv6.c
@@ -28,7 +28,7 @@ static void nft_dup_ipv6_eval(const struct nft_expr *expr,
struct in6_addr *gw = (struct in6_addr *)&regs->data[priv->sreg_addr];
int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1;
- nf_dup_ipv6(pkt->net, pkt->skb, pkt->hook, gw, oif);
+ nf_dup_ipv6(nft_net(pkt), pkt->skb, nft_hook(pkt), gw, oif);
}
static int nft_dup_ipv6_init(const struct nft_ctx *ctx,
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c
new file mode 100644
index 000000000000..765facf03d45
--- /dev/null
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -0,0 +1,270 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_fib.h>
+
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+
+static int get_ifindex(const struct net_device *dev)
+{
+ return dev ? dev->ifindex : 0;
+}
+
+static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv,
+ const struct nft_pktinfo *pkt,
+ const struct net_device *dev)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(pkt->skb);
+ int lookup_flags = 0;
+
+ if (priv->flags & NFTA_FIB_F_DADDR) {
+ fl6->daddr = iph->daddr;
+ fl6->saddr = iph->saddr;
+ } else {
+ fl6->daddr = iph->saddr;
+ fl6->saddr = iph->daddr;
+ }
+
+ if (ipv6_addr_type(&fl6->daddr) & IPV6_ADDR_LINKLOCAL) {
+ lookup_flags |= RT6_LOOKUP_F_IFACE;
+ fl6->flowi6_oif = get_ifindex(dev ? dev : pkt->skb->dev);
+ }
+
+ if (ipv6_addr_type(&fl6->saddr) & IPV6_ADDR_UNICAST)
+ lookup_flags |= RT6_LOOKUP_F_HAS_SADDR;
+
+ if (priv->flags & NFTA_FIB_F_MARK)
+ fl6->flowi6_mark = pkt->skb->mark;
+
+ fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK;
+
+ return lookup_flags;
+}
+
+static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
+ const struct nft_pktinfo *pkt)
+{
+ const struct net_device *dev = NULL;
+ const struct nf_ipv6_ops *v6ops;
+ const struct nf_afinfo *afinfo;
+ int route_err, addrtype;
+ struct rt6_info *rt;
+ struct flowi6 fl6 = {
+ .flowi6_iif = LOOPBACK_IFINDEX,
+ .flowi6_proto = pkt->tprot,
+ };
+ u32 ret = 0;
+
+ afinfo = nf_get_afinfo(NFPROTO_IPV6);
+ if (!afinfo)
+ return RTN_UNREACHABLE;
+
+ if (priv->flags & NFTA_FIB_F_IIF)
+ dev = nft_in(pkt);
+ else if (priv->flags & NFTA_FIB_F_OIF)
+ dev = nft_out(pkt);
+
+ nft_fib6_flowi_init(&fl6, priv, pkt, dev);
+
+ v6ops = nf_get_ipv6_ops();
+ if (dev && v6ops && v6ops->chk_addr(nft_net(pkt), &fl6.daddr, dev, true))
+ ret = RTN_LOCAL;
+
+ route_err = afinfo->route(nft_net(pkt), (struct dst_entry **)&rt,
+ flowi6_to_flowi(&fl6), false);
+ if (route_err)
+ goto err;
+
+ if (rt->rt6i_flags & RTF_REJECT) {
+ route_err = rt->dst.error;
+ dst_release(&rt->dst);
+ goto err;
+ }
+
+ if (ipv6_anycast_destination((struct dst_entry *)rt, &fl6.daddr))
+ ret = RTN_ANYCAST;
+ else if (!dev && rt->rt6i_flags & RTF_LOCAL)
+ ret = RTN_LOCAL;
+
+ dst_release(&rt->dst);
+
+ if (ret)
+ return ret;
+
+ addrtype = ipv6_addr_type(&fl6.daddr);
+
+ if (addrtype & IPV6_ADDR_MULTICAST)
+ return RTN_MULTICAST;
+ if (addrtype & IPV6_ADDR_UNICAST)
+ return RTN_UNICAST;
+
+ return RTN_UNSPEC;
+ err:
+ switch (route_err) {
+ case -EINVAL:
+ return RTN_BLACKHOLE;
+ case -EACCES:
+ return RTN_PROHIBIT;
+ case -EAGAIN:
+ return RTN_THROW;
+ default:
+ break;
+ }
+
+ return RTN_UNREACHABLE;
+}
+
+void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+
+ *dest = __nft_fib6_eval_type(priv, pkt);
+}
+EXPORT_SYMBOL_GPL(nft_fib6_eval_type);
+
+void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ const struct net_device *oif = NULL;
+ u32 *dest = &regs->data[priv->dreg];
+ struct flowi6 fl6 = {
+ .flowi6_iif = LOOPBACK_IFINDEX,
+ .flowi6_proto = pkt->tprot,
+ };
+ struct rt6_info *rt;
+ int lookup_flags;
+
+ if (priv->flags & NFTA_FIB_F_IIF)
+ oif = nft_in(pkt);
+ else if (priv->flags & NFTA_FIB_F_OIF)
+ oif = nft_out(pkt);
+
+ lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif);
+
+ if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
+ nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
+ nft_fib_store_result(dest, priv->result, pkt,
+ nft_in(pkt)->ifindex);
+ return;
+ }
+
+ *dest = 0;
+ again:
+ rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, lookup_flags);
+ if (rt->dst.error)
+ goto put_rt_err;
+
+ /* Should not see RTF_LOCAL here */
+ if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL))
+ goto put_rt_err;
+
+ if (oif && oif != rt->rt6i_idev->dev) {
+ /* multipath route? Try again with F_IFACE */
+ if ((lookup_flags & RT6_LOOKUP_F_IFACE) == 0) {
+ lookup_flags |= RT6_LOOKUP_F_IFACE;
+ fl6.flowi6_oif = oif->ifindex;
+ ip6_rt_put(rt);
+ goto again;
+ }
+ }
+
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF:
+ *dest = rt->rt6i_idev->dev->ifindex;
+ break;
+ case NFT_FIB_RESULT_OIFNAME:
+ strncpy((char *)dest, rt->rt6i_idev->dev->name, IFNAMSIZ);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ put_rt_err:
+ ip6_rt_put(rt);
+}
+EXPORT_SYMBOL_GPL(nft_fib6_eval);
+
+static struct nft_expr_type nft_fib6_type;
+
+static const struct nft_expr_ops nft_fib6_type_ops = {
+ .type = &nft_fib6_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+ .eval = nft_fib6_eval_type,
+ .init = nft_fib_init,
+ .dump = nft_fib_dump,
+ .validate = nft_fib_validate,
+};
+
+static const struct nft_expr_ops nft_fib6_ops = {
+ .type = &nft_fib6_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+ .eval = nft_fib6_eval,
+ .init = nft_fib_init,
+ .dump = nft_fib_dump,
+ .validate = nft_fib_validate,
+};
+
+static const struct nft_expr_ops *
+nft_fib6_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ enum nft_fib_result result;
+
+ if (!tb[NFTA_FIB_RESULT])
+ return ERR_PTR(-EINVAL);
+
+ result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
+
+ switch (result) {
+ case NFT_FIB_RESULT_OIF:
+ return &nft_fib6_ops;
+ case NFT_FIB_RESULT_OIFNAME:
+ return &nft_fib6_ops;
+ case NFT_FIB_RESULT_ADDRTYPE:
+ return &nft_fib6_type_ops;
+ default:
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+}
+
+static struct nft_expr_type nft_fib6_type __read_mostly = {
+ .name = "fib",
+ .select_ops = &nft_fib6_select_ops,
+ .policy = nft_fib_policy,
+ .maxattr = NFTA_FIB_MAX,
+ .family = NFPROTO_IPV6,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_fib6_module_init(void)
+{
+ return nft_register_expr(&nft_fib6_type);
+}
+
+static void __exit nft_fib6_module_exit(void)
+{
+ nft_unregister_expr(&nft_fib6_type);
+}
+module_init(nft_fib6_module_init);
+module_exit(nft_fib6_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_ALIAS_NFT_AF_EXPR(10, "fib");
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index 9597ffb74077..4146536e9c15 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -27,12 +27,19 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
memset(&range, 0, sizeof(range));
range.flags = priv->flags;
if (priv->sreg_proto_min) {
- range.min_proto.all =
- *(__be16 *)&regs->data[priv->sreg_proto_min];
- range.max_proto.all =
- *(__be16 *)&regs->data[priv->sreg_proto_max];
+ range.min_proto.all = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg_proto_min]);
+ range.max_proto.all = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg_proto_max]);
}
- regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out);
+ regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range,
+ nft_out(pkt));
+}
+
+static void
+nft_masq_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV6);
}
static struct nft_expr_type nft_masq_ipv6_type;
@@ -41,6 +48,7 @@ static const struct nft_expr_ops nft_masq_ipv6_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
.eval = nft_masq_ipv6_eval,
.init = nft_masq_init,
+ .destroy = nft_masq_ipv6_destroy,
.dump = nft_masq_dump,
.validate = nft_masq_validate,
};
@@ -77,5 +85,5 @@ module_init(nft_masq_ipv6_module_init);
module_exit(nft_masq_ipv6_module_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "masq");
diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c
index aca44e89a881..a27e424f690d 100644
--- a/net/ipv6/netfilter/nft_redir_ipv6.c
+++ b/net/ipv6/netfilter/nft_redir_ipv6.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -26,16 +26,23 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr,
memset(&range, 0, sizeof(range));
if (priv->sreg_proto_min) {
- range.min_proto.all =
- *(__be16 *)&regs->data[priv->sreg_proto_min],
- range.max_proto.all =
- *(__be16 *)&regs->data[priv->sreg_proto_max],
+ range.min_proto.all = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg_proto_min]);
+ range.max_proto.all = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg_proto_max]);
range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
}
range.flags |= priv->flags;
- regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range, pkt->hook);
+ regs->verdict.code =
+ nf_nat_redirect_ipv6(pkt->skb, &range, nft_hook(pkt));
+}
+
+static void
+nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV6);
}
static struct nft_expr_type nft_redir_ipv6_type;
@@ -44,6 +51,7 @@ static const struct nft_expr_ops nft_redir_ipv6_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
.eval = nft_redir_ipv6_eval,
.init = nft_redir_init,
+ .destroy = nft_redir_ipv6_destroy,
.dump = nft_redir_dump,
.validate = nft_redir_validate,
};
@@ -71,5 +79,5 @@ module_init(nft_redir_ipv6_module_init);
module_exit(nft_redir_ipv6_module_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "redir");
diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c
index 92bda9908bb9..057deeaff1cb 100644
--- a/net/ipv6/netfilter/nft_reject_ipv6.c
+++ b/net/ipv6/netfilter/nft_reject_ipv6.c
@@ -27,11 +27,11 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr,
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nf_send_unreach6(pkt->net, pkt->skb, priv->icmp_code,
- pkt->hook);
+ nf_send_unreach6(nft_net(pkt), pkt->skb, priv->icmp_code,
+ nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset6(pkt->net, pkt->skb, pkt->hook);
+ nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt));
break;
default:
break;
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 66e2d9dfc43a..9b522fa90e6d 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -113,6 +113,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.daddr = *daddr;
fl6.flowi6_oif = oif;
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
fl6.fl6_icmp_type = user_icmph.icmp6_type;
fl6.fl6_icmp_code = user_icmph.icmp6_code;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
@@ -125,12 +126,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return PTR_ERR(dst);
rt = (struct rt6_info *) dst;
- np = inet6_sk(sk);
- if (!np) {
- err = -EBADF;
- goto dst_err_out;
- }
-
if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
fl6.flowi6_oif = np->mcast_oif;
else if (!fl6.flowi6_oif)
@@ -165,7 +160,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
release_sock(sk);
-dst_err_out:
dst_release(dst);
if (err)
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 054a1d84fc5e..f174e76e6505 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -65,11 +65,12 @@
#define ICMPV6_HDRLEN 4 /* ICMPv6 header, RFC 4443 Section 2.1 */
-static struct raw_hashinfo raw_v6_hashinfo = {
+struct raw_hashinfo raw_v6_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock),
};
+EXPORT_SYMBOL_GPL(raw_v6_hashinfo);
-static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
+struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
unsigned short num, const struct in6_addr *loc_addr,
const struct in6_addr *rmt_addr, int dif)
{
@@ -102,6 +103,7 @@ static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
found:
return sk;
}
+EXPORT_SYMBOL_GPL(__raw_v6_lookup);
/*
* 0 - deliver
@@ -589,7 +591,11 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
}
offset += skb_transport_offset(skb);
- BUG_ON(skb_copy_bits(skb, offset, &csum, 2));
+ err = skb_copy_bits(skb, offset, &csum, 2);
+ if (err < 0) {
+ ip6_flush_pending_frames(sk);
+ goto out;
+ }
/* in case cksum was not initialized */
if (unlikely(csum))
@@ -648,6 +654,9 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
skb->ip_summed = CHECKSUM_NONE;
+ if (flags & MSG_CONFIRM)
+ skb_set_dst_pending_confirm(skb, 1);
+
skb->transport_header = skb->network_header;
err = memcpy_from_msg(iph, msg, length);
if (err)
@@ -774,6 +783,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
ipc6.hlimit = -1;
ipc6.tclass = -1;
@@ -927,7 +937,8 @@ out:
txopt_put(opt_to_free);
return err < 0 ? err : len;
do_confirm:
- dst_confirm(dst);
+ if (msg->msg_flags & MSG_PROBE)
+ dst_confirm_neigh(dst, &fl6.daddr);
if (!(msg->msg_flags & MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
@@ -1259,6 +1270,7 @@ struct proto rawv6_prot = {
.compat_getsockopt = compat_rawv6_getsockopt,
.compat_ioctl = compat_rawv6_ioctl,
#endif
+ .diag_destroy = raw_abort,
};
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 3815e8505ed2..e1da5b888cc4 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -211,7 +211,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
{
struct sk_buff *prev, *next;
struct net_device *dev;
- int offset, end;
+ int offset, end, fragsize;
struct net *net = dev_net(skb_dst(skb)->dev);
u8 ecn;
@@ -336,6 +336,10 @@ found:
fq->ecn |= ecn;
add_frag_mem_limit(fq->q.net, skb->truesize);
+ fragsize = -skb_network_offset(skb) + skb->len;
+ if (fragsize > fq->q.max_size)
+ fq->q.max_size = fragsize;
+
/* The first fragment.
* nhoffset is obtained from the first fragment, of course.
*/
@@ -495,6 +499,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn);
IP6CB(head)->nhoff = nhoff;
IP6CB(head)->flags |= IP6SKB_FRAGMENTED;
+ IP6CB(head)->frag_max_size = fq->q.max_size;
/* Yes, and fold redundant checksum back. 8) */
skb_postpush_rcsum(head, skb_network_header(head),
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1b57e11e6e0d..fb174b590fd3 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -64,7 +64,7 @@
#include <net/l3mdev.h>
#include <trace/events/fib6.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
@@ -98,6 +98,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
static void rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
+static size_t rt6_nlmsg_size(struct rt6_info *rt);
+static int rt6_fill_node(struct net *net,
+ struct sk_buff *skb, struct rt6_info *rt,
+ struct in6_addr *dst, struct in6_addr *src,
+ int iif, int type, u32 portid, u32 seq,
+ unsigned int flags);
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
@@ -217,6 +223,21 @@ static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
return neigh_create(&nd_tbl, daddr, dst->dev);
}
+static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+ struct net_device *dev = dst->dev;
+ struct rt6_info *rt = (struct rt6_info *)dst;
+
+ daddr = choose_neigh_daddr(rt, NULL, daddr);
+ if (!daddr)
+ return;
+ if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
+ return;
+ if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
+ return;
+ __ipv6_confirm_neigh(dev, daddr);
+}
+
static struct dst_ops ip6_dst_ops_template = {
.family = AF_INET6,
.gc = ip6_dst_gc,
@@ -233,6 +254,7 @@ static struct dst_ops ip6_dst_ops_template = {
.redirect = rt6_do_redirect,
.local_out = __ip6_local_out,
.neigh_lookup = ip6_neigh_lookup,
+ .confirm_neigh = ip6_confirm_neigh,
};
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
@@ -527,7 +549,7 @@ static void rt6_probe_deferred(struct work_struct *w)
container_of(w, struct __rt6_probe_work, work);
addrconf_addr_solict_mult(&work->target, &mcaddr);
- ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
+ ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
dev_put(work->dev);
kfree(work);
}
@@ -1359,6 +1381,7 @@ static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
const struct ipv6hdr *iph, u32 mtu)
{
+ const struct in6_addr *daddr, *saddr;
struct rt6_info *rt6 = (struct rt6_info *)dst;
if (rt6->rt6i_flags & RTF_LOCAL)
@@ -1367,26 +1390,26 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
if (dst_metric_locked(dst, RTAX_MTU))
return;
- dst_confirm(dst);
+ if (iph) {
+ daddr = &iph->daddr;
+ saddr = &iph->saddr;
+ } else if (sk) {
+ daddr = &sk->sk_v6_daddr;
+ saddr = &inet6_sk(sk)->saddr;
+ } else {
+ daddr = NULL;
+ saddr = NULL;
+ }
+ dst_confirm_neigh(dst, daddr);
mtu = max_t(u32, mtu, IPV6_MIN_MTU);
if (mtu >= dst_mtu(dst))
return;
if (!rt6_cache_allowed_for_pmtu(rt6)) {
rt6_do_update_pmtu(rt6, mtu);
- } else {
- const struct in6_addr *daddr, *saddr;
+ } else if (daddr) {
struct rt6_info *nrt6;
- if (iph) {
- daddr = &iph->daddr;
- saddr = &iph->saddr;
- } else if (sk) {
- daddr = &sk->sk_v6_daddr;
- saddr = &inet6_sk(sk)->saddr;
- } else {
- return;
- }
nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
if (nrt6) {
rt6_do_update_pmtu(nrt6, mtu);
@@ -1408,7 +1431,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
}
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
- int oif, u32 mark)
+ int oif, u32 mark, kuid_t uid)
{
const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
struct dst_entry *dst;
@@ -1420,6 +1443,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
fl6.daddr = iph->daddr;
fl6.saddr = iph->saddr;
fl6.flowlabel = ip6_flowinfo(iph);
+ fl6.flowi6_uid = uid;
dst = ip6_route_output(net, NULL, &fl6);
if (!dst->error)
@@ -1433,7 +1457,7 @@ void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
struct dst_entry *dst;
ip6_update_pmtu(skb, sock_net(sk), mtu,
- sk->sk_bound_dev_if, sk->sk_mark);
+ sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
dst = __sk_dst_get(sk);
if (!dst || !dst->obsolete ||
@@ -1463,7 +1487,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
struct fib6_node *fn;
/* Get the "current" route for this destination and
- * check if the redirect has come from approriate router.
+ * check if the redirect has come from appropriate router.
*
* RFC 4861 specifies that redirects should only be
* accepted if they come from the nexthop to the target.
@@ -1525,7 +1549,8 @@ static struct dst_entry *ip6_route_redirect(struct net *net,
flags, __ip6_route_redirect);
}
-void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
+void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
+ kuid_t uid)
{
const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
struct dst_entry *dst;
@@ -1538,6 +1563,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
fl6.daddr = iph->daddr;
fl6.saddr = iph->saddr;
fl6.flowlabel = ip6_flowinfo(iph);
+ fl6.flowi6_uid = uid;
dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
rt6_do_redirect(dst, NULL, skb);
@@ -1559,6 +1585,7 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
fl6.flowi6_mark = mark;
fl6.daddr = msg->dest;
fl6.saddr = iph->daddr;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
dst = ip6_route_redirect(net, &fl6, &iph->saddr);
rt6_do_redirect(dst, NULL, skb);
@@ -1567,7 +1594,8 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
- ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
+ ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
+ sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
@@ -1826,6 +1854,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
int addr_type;
int err = -EINVAL;
+ /* RTF_PCPU is an internal flag; can not be set by userspace */
+ if (cfg->fc_flags & RTF_PCPU)
+ goto out;
+
if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
goto out;
#ifndef CONFIG_IPV6_SUBTREES
@@ -1892,7 +1924,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
if (cfg->fc_encap) {
struct lwtunnel_state *lwtstate;
- err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+ err = lwtunnel_build_state(cfg->fc_encap_type,
cfg->fc_encap, AF_INET6, cfg,
&lwtstate);
if (err)
@@ -1995,8 +2027,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
It is very good, but in some (rare!) circumstances
(SIT, PtP, NBMA NOARP links) it is handy to allow
some exceptions. --ANK
+ We allow IPv4-mapped nexthops to support RFC4798-type
+ addressing
*/
- if (!(gwa_type & IPV6_ADDR_UNICAST))
+ if (!(gwa_type & (IPV6_ADDR_UNICAST |
+ IPV6_ADDR_MAPPED)))
goto out;
if (cfg->fc_table) {
@@ -2135,6 +2170,58 @@ int ip6_del_rt(struct rt6_info *rt)
return __ip6_del_rt(rt, &info);
}
+static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
+{
+ struct nl_info *info = &cfg->fc_nlinfo;
+ struct net *net = info->nl_net;
+ struct sk_buff *skb = NULL;
+ struct fib6_table *table;
+ int err = -ENOENT;
+
+ if (rt == net->ipv6.ip6_null_entry)
+ goto out_put;
+ table = rt->rt6i_table;
+ write_lock_bh(&table->tb6_lock);
+
+ if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
+ struct rt6_info *sibling, *next_sibling;
+
+ /* prefer to send a single notification with all hops */
+ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
+ if (skb) {
+ u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+
+ if (rt6_fill_node(net, skb, rt,
+ NULL, NULL, 0, RTM_DELROUTE,
+ info->portid, seq, 0) < 0) {
+ kfree_skb(skb);
+ skb = NULL;
+ } else
+ info->skip_notify = 1;
+ }
+
+ list_for_each_entry_safe(sibling, next_sibling,
+ &rt->rt6i_siblings,
+ rt6i_siblings) {
+ err = fib6_del(sibling, info);
+ if (err)
+ goto out_unlock;
+ }
+ }
+
+ err = fib6_del(rt, info);
+out_unlock:
+ write_unlock_bh(&table->tb6_lock);
+out_put:
+ ip6_rt_put(rt);
+
+ if (skb) {
+ rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
+ info->nlh, gfp_any());
+ }
+ return err;
+}
+
static int ip6_route_del(struct fib6_config *cfg)
{
struct fib6_table *table;
@@ -2166,10 +2253,16 @@ static int ip6_route_del(struct fib6_config *cfg)
continue;
if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
continue;
+ if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
+ continue;
dst_hold(&rt->dst);
read_unlock_bh(&table->tb6_lock);
- return __ip6_del_rt(rt, &cfg->fc_nlinfo);
+ /* if gateway was specified only delete the one hop */
+ if (cfg->fc_flags & RTF_GATEWAY)
+ return __ip6_del_rt(rt, &cfg->fc_nlinfo);
+
+ return __ip6_del_rt_siblings(rt, cfg);
}
}
read_unlock_bh(&table->tb6_lock);
@@ -2248,7 +2341,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
* Look, redirects are sent only in response to data packets,
* so that this nexthop apparently is reachable. --ANK
*/
- dst_confirm(&rt->dst);
+ dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
if (!neigh)
@@ -2624,6 +2717,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
rt->dst.output = ip6_output;
rt->rt6i_idev = idev;
+ rt->rt6i_protocol = RTPROT_KERNEL;
rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
if (anycast)
rt->rt6i_flags |= RTF_ANYCAST;
@@ -2701,13 +2795,16 @@ struct arg_dev_net {
struct net *net;
};
+/* called with write lock held for table with rt */
static int fib6_ifdown(struct rt6_info *rt, void *arg)
{
const struct arg_dev_net *adn = arg;
const struct net_device *dev = adn->dev;
if ((rt->dst.dev == dev || !dev) &&
- rt != adn->net->ipv6.ip6_null_entry)
+ rt != adn->net->ipv6.ip6_null_entry &&
+ (rt->rt6i_nsiblings == 0 ||
+ !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
return -1;
return 0;
@@ -2758,7 +2855,7 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
old MTU is the lowest MTU in the path, update the route PMTU
to reflect the increase. In this case if the other nodes' MTU
also have the lowest MTU, TOO BIG MESSAGE will be lead to
- PMTU discouvery.
+ PMTU discovery.
*/
if (rt->dst.dev == arg->dev &&
dst_metric_raw(&rt->dst, RTAX_MTU) &&
@@ -2801,6 +2898,8 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_ENCAP_TYPE] = { .type = NLA_U16 },
[RTA_ENCAP] = { .type = NLA_NESTED },
[RTA_EXPIRES] = { .type = NLA_U32 },
+ [RTA_UID] = { .type = NLA_U32 },
+ [RTA_MARK] = { .type = NLA_U32 },
};
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2885,6 +2984,11 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[RTA_MULTIPATH]) {
cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
+
+ err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
+ cfg->fc_mp_len);
+ if (err < 0)
+ goto errout;
}
if (tb[RTA_PREF]) {
@@ -2898,9 +3002,14 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[RTA_ENCAP])
cfg->fc_encap = tb[RTA_ENCAP];
- if (tb[RTA_ENCAP_TYPE])
+ if (tb[RTA_ENCAP_TYPE]) {
cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
+ err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
+ if (err < 0)
+ goto errout;
+ }
+
if (tb[RTA_EXPIRES]) {
unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
@@ -2927,7 +3036,7 @@ static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
struct rt6_nh *nh;
list_for_each_entry(nh, rt6_nh_list, next) {
- pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
+ pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
&nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
nh->r_cfg.fc_ifindex);
}
@@ -2966,13 +3075,37 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list,
return 0;
}
+static void ip6_route_mpath_notify(struct rt6_info *rt,
+ struct rt6_info *rt_last,
+ struct nl_info *info,
+ __u16 nlflags)
+{
+ /* if this is an APPEND route, then rt points to the first route
+ * inserted and rt_last points to last route inserted. Userspace
+ * wants a consistent dump of the route which starts at the first
+ * nexthop. Since sibling routes are always added at the end of
+ * the list, find the first sibling of the last route appended
+ */
+ if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
+ rt = list_first_entry(&rt_last->rt6i_siblings,
+ struct rt6_info,
+ rt6i_siblings);
+ }
+
+ if (rt)
+ inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
+}
+
static int ip6_route_multipath_add(struct fib6_config *cfg)
{
+ struct rt6_info *rt_notif = NULL, *rt_last = NULL;
+ struct nl_info *info = &cfg->fc_nlinfo;
struct fib6_config r_cfg;
struct rtnexthop *rtnh;
struct rt6_info *rt;
struct rt6_nh *err_nh;
struct rt6_nh *nh, *nh_safe;
+ __u16 nlflags;
int remaining;
int attrlen;
int err = 1;
@@ -2981,6 +3114,10 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
LIST_HEAD(rt6_nh_list);
+ nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
+ if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
+ nlflags |= NLM_F_APPEND;
+
remaining = cfg->fc_mp_len;
rtnh = (struct rtnexthop *)cfg->fc_mp;
@@ -3023,9 +3160,20 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
rtnh = rtnh_next(rtnh, &remaining);
}
+ /* for add and replace send one notification with all nexthops.
+ * Skip the notification in fib6_add_rt2node and send one with
+ * the full route when done
+ */
+ info->skip_notify = 1;
+
err_nh = NULL;
list_for_each_entry(nh, &rt6_nh_list, next) {
- err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
+ rt_last = nh->rt6_info;
+ err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc);
+ /* save reference to first route for notification */
+ if (!rt_notif && !err)
+ rt_notif = nh->rt6_info;
+
/* nh->rt6_info is used or freed at this point, reset to NULL*/
nh->rt6_info = NULL;
if (err) {
@@ -3047,9 +3195,18 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
nhn++;
}
+ /* success ... tell user about new route */
+ ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
goto cleanup;
add_errout:
+ /* send notification for routes that were added so that
+ * the delete notifications sent by ip6_route_del are
+ * coherent
+ */
+ if (rt_notif)
+ ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
+
/* Delete routes that were already added */
list_for_each_entry(nh, &rt6_nh_list, next) {
if (err_nh == nh)
@@ -3117,8 +3274,10 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
if (cfg.fc_mp)
return ip6_route_multipath_del(&cfg);
- else
+ else {
+ cfg.fc_delete_all_nh = 1;
return ip6_route_del(&cfg);
+ }
}
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
@@ -3136,8 +3295,19 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
return ip6_route_add(&cfg);
}
-static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
+static size_t rt6_nlmsg_size(struct rt6_info *rt)
{
+ int nexthop_len = 0;
+
+ if (rt->rt6i_nsiblings) {
+ nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
+ + NLA_ALIGN(sizeof(struct rtnexthop))
+ + nla_total_size(16) /* RTA_GATEWAY */
+ + lwtunnel_get_encap_size(rt->dst.lwtstate);
+
+ nexthop_len *= rt->rt6i_nsiblings;
+ }
+
return NLMSG_ALIGN(sizeof(struct rtmsg))
+ nla_total_size(16) /* RTA_SRC */
+ nla_total_size(16) /* RTA_DST */
@@ -3151,14 +3321,71 @@ static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
+ nla_total_size(sizeof(struct rta_cacheinfo))
+ nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
+ nla_total_size(1) /* RTA_PREF */
- + lwtunnel_get_encap_size(rt->dst.lwtstate);
+ + lwtunnel_get_encap_size(rt->dst.lwtstate)
+ + nexthop_len;
+}
+
+static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
+ unsigned int *flags, bool skip_oif)
+{
+ if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
+ *flags |= RTNH_F_LINKDOWN;
+ if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
+ *flags |= RTNH_F_DEAD;
+ }
+
+ if (rt->rt6i_flags & RTF_GATEWAY) {
+ if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
+ goto nla_put_failure;
+ }
+
+ /* not needed for multipath encoding b/c it has a rtnexthop struct */
+ if (!skip_oif && rt->dst.dev &&
+ nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
+ goto nla_put_failure;
+
+ if (rt->dst.lwtstate &&
+ lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+/* add multipath next hop */
+static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
+{
+ struct rtnexthop *rtnh;
+ unsigned int flags = 0;
+
+ rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
+ if (!rtnh)
+ goto nla_put_failure;
+
+ rtnh->rtnh_hops = 0;
+ rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
+
+ if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
+ goto nla_put_failure;
+
+ rtnh->rtnh_flags = flags;
+
+ /* length of rtnetlink header + attributes */
+ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
}
static int rt6_fill_node(struct net *net,
struct sk_buff *skb, struct rt6_info *rt,
struct in6_addr *dst, struct in6_addr *src,
int iif, int type, u32 portid, u32 seq,
- int prefix, int nowait, unsigned int flags)
+ unsigned int flags)
{
u32 metrics[RTAX_MAX];
struct rtmsg *rtm;
@@ -3166,13 +3393,6 @@ static int rt6_fill_node(struct net *net,
long expires;
u32 table;
- if (prefix) { /* user wants prefix routes only */
- if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
- /* success since this is not a prefix route */
- return 1;
- }
- }
-
nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
if (!nlh)
return -EMSGSIZE;
@@ -3207,16 +3427,13 @@ static int rt6_fill_node(struct net *net,
}
else if (rt->rt6i_flags & RTF_LOCAL)
rtm->rtm_type = RTN_LOCAL;
+ else if (rt->rt6i_flags & RTF_ANYCAST)
+ rtm->rtm_type = RTN_ANYCAST;
else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
rtm->rtm_type = RTN_LOCAL;
else
rtm->rtm_type = RTN_UNICAST;
rtm->rtm_flags = 0;
- if (!netif_carrier_ok(rt->dst.dev)) {
- rtm->rtm_flags |= RTNH_F_LINKDOWN;
- if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
- rtm->rtm_flags |= RTNH_F_DEAD;
- }
rtm->rtm_scope = RT_SCOPE_UNIVERSE;
rtm->rtm_protocol = rt->rt6i_protocol;
if (rt->rt6i_flags & RTF_DYNAMIC)
@@ -3250,19 +3467,12 @@ static int rt6_fill_node(struct net *net,
if (iif) {
#ifdef CONFIG_IPV6_MROUTE
if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
- int err = ip6mr_get_route(net, skb, rtm, nowait,
- portid);
-
- if (err <= 0) {
- if (!nowait) {
- if (err == 0)
- return 0;
- goto nla_put_failure;
- } else {
- if (err == -EMSGSIZE)
- goto nla_put_failure;
- }
- }
+ int err = ip6mr_get_route(net, skb, rtm, portid);
+
+ if (err == 0)
+ return 0;
+ if (err < 0)
+ goto nla_put_failure;
} else
#endif
if (nla_put_u32(skb, RTA_IIF, iif))
@@ -3287,17 +3497,35 @@ static int rt6_fill_node(struct net *net,
if (rtnetlink_put_metrics(skb, metrics) < 0)
goto nla_put_failure;
- if (rt->rt6i_flags & RTF_GATEWAY) {
- if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
- goto nla_put_failure;
- }
-
- if (rt->dst.dev &&
- nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
- goto nla_put_failure;
if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
goto nla_put_failure;
+ /* For multipath routes, walk the siblings list and add
+ * each as a nexthop within RTA_MULTIPATH.
+ */
+ if (rt->rt6i_nsiblings) {
+ struct rt6_info *sibling, *next_sibling;
+ struct nlattr *mp;
+
+ mp = nla_nest_start(skb, RTA_MULTIPATH);
+ if (!mp)
+ goto nla_put_failure;
+
+ if (rt6_add_nexthop(skb, rt) < 0)
+ goto nla_put_failure;
+
+ list_for_each_entry_safe(sibling, next_sibling,
+ &rt->rt6i_siblings, rt6i_siblings) {
+ if (rt6_add_nexthop(skb, sibling) < 0)
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, mp);
+ } else {
+ if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
+ goto nla_put_failure;
+ }
+
expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
@@ -3306,7 +3534,6 @@ static int rt6_fill_node(struct net *net,
if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
goto nla_put_failure;
- lwtunnel_fill_encap(skb, rt->dst.lwtstate);
nlmsg_end(skb, nlh);
return 0;
@@ -3319,18 +3546,26 @@ nla_put_failure:
int rt6_dump_route(struct rt6_info *rt, void *p_arg)
{
struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
- int prefix;
+ struct net *net = arg->net;
+
+ if (rt == net->ipv6.ip6_null_entry)
+ return 0;
if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
- prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
- } else
- prefix = 0;
- return rt6_fill_node(arg->net,
+ /* user wants prefix routes only */
+ if (rtm->rtm_flags & RTM_F_PREFIX &&
+ !(rt->rt6i_flags & RTF_PREFIX_RT)) {
+ /* success since this is not a prefix route */
+ return 1;
+ }
+ }
+
+ return rt6_fill_node(net,
arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
- prefix, 0, NLM_F_MULTI);
+ NLM_F_MULTI);
}
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
@@ -3375,6 +3610,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
if (tb[RTA_MARK])
fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
+ if (tb[RTA_UID])
+ fl6.flowi6_uid = make_kuid(current_user_ns(),
+ nla_get_u32(tb[RTA_UID]));
+ else
+ fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
+
if (iif) {
struct net_device *dev;
int flags = 0;
@@ -3398,6 +3639,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
}
+ if (rt == net->ipv6.ip6_null_entry) {
+ err = rt->dst.error;
+ ip6_rt_put(rt);
+ goto errout;
+ }
+
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb) {
ip6_rt_put(rt);
@@ -3405,17 +3652,11 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
goto errout;
}
- /* Reserve room for dummy headers, this skb can pass
- through good chunk of routing engine.
- */
- skb_reset_mac_header(skb);
- skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
-
skb_dst_set(skb, &rt->dst);
err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, 0, 0);
+ nlh->nlmsg_seq, 0);
if (err < 0) {
kfree_skb(skb);
goto errout;
@@ -3442,7 +3683,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
goto errout;
err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
- event, info->portid, seq, 0, 0, nlm_flags);
+ event, info->portid, seq, nlm_flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
new file mode 100644
index 000000000000..5f44ffed2576
--- /dev/null
+++ b/net/ipv6/seg6.c
@@ -0,0 +1,500 @@
+/*
+ * SR-IPv6 implementation
+ *
+ * Author:
+ * David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/slab.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+
+#include <net/seg6.h>
+#include <net/genetlink.h>
+#include <linux/seg6.h>
+#include <linux/seg6_genl.h>
+#ifdef CONFIG_IPV6_SEG6_HMAC
+#include <net/seg6_hmac.h>
+#endif
+
+bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len)
+{
+ int trailing;
+ unsigned int tlv_offset;
+
+ if (srh->type != IPV6_SRCRT_TYPE_4)
+ return false;
+
+ if (((srh->hdrlen + 1) << 3) != len)
+ return false;
+
+ if (srh->segments_left != srh->first_segment)
+ return false;
+
+ tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4);
+
+ trailing = len - tlv_offset;
+ if (trailing < 0)
+ return false;
+
+ while (trailing) {
+ struct sr6_tlv *tlv;
+ unsigned int tlv_len;
+
+ if (trailing < sizeof(*tlv))
+ return false;
+
+ tlv = (struct sr6_tlv *)((unsigned char *)srh + tlv_offset);
+ tlv_len = sizeof(*tlv) + tlv->len;
+
+ trailing -= tlv_len;
+ if (trailing < 0)
+ return false;
+
+ tlv_offset += tlv_len;
+ }
+
+ return true;
+}
+
+static struct genl_family seg6_genl_family;
+
+static const struct nla_policy seg6_genl_policy[SEG6_ATTR_MAX + 1] = {
+ [SEG6_ATTR_DST] = { .type = NLA_BINARY,
+ .len = sizeof(struct in6_addr) },
+ [SEG6_ATTR_DSTLEN] = { .type = NLA_S32, },
+ [SEG6_ATTR_HMACKEYID] = { .type = NLA_U32, },
+ [SEG6_ATTR_SECRET] = { .type = NLA_BINARY, },
+ [SEG6_ATTR_SECRETLEN] = { .type = NLA_U8, },
+ [SEG6_ATTR_ALGID] = { .type = NLA_U8, },
+ [SEG6_ATTR_HMACINFO] = { .type = NLA_NESTED, },
+};
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+
+static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct seg6_pernet_data *sdata;
+ struct seg6_hmac_info *hinfo;
+ u32 hmackeyid;
+ char *secret;
+ int err = 0;
+ u8 algid;
+ u8 slen;
+
+ sdata = seg6_pernet(net);
+
+ if (!info->attrs[SEG6_ATTR_HMACKEYID] ||
+ !info->attrs[SEG6_ATTR_SECRETLEN] ||
+ !info->attrs[SEG6_ATTR_ALGID])
+ return -EINVAL;
+
+ hmackeyid = nla_get_u32(info->attrs[SEG6_ATTR_HMACKEYID]);
+ slen = nla_get_u8(info->attrs[SEG6_ATTR_SECRETLEN]);
+ algid = nla_get_u8(info->attrs[SEG6_ATTR_ALGID]);
+
+ if (hmackeyid == 0)
+ return -EINVAL;
+
+ if (slen > SEG6_HMAC_SECRET_LEN)
+ return -EINVAL;
+
+ mutex_lock(&sdata->lock);
+ hinfo = seg6_hmac_info_lookup(net, hmackeyid);
+
+ if (!slen) {
+ if (!hinfo)
+ err = -ENOENT;
+
+ err = seg6_hmac_info_del(net, hmackeyid);
+
+ goto out_unlock;
+ }
+
+ if (!info->attrs[SEG6_ATTR_SECRET]) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (hinfo) {
+ err = seg6_hmac_info_del(net, hmackeyid);
+ if (err)
+ goto out_unlock;
+ }
+
+ secret = (char *)nla_data(info->attrs[SEG6_ATTR_SECRET]);
+
+ hinfo = kzalloc(sizeof(*hinfo), GFP_KERNEL);
+ if (!hinfo) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ memcpy(hinfo->secret, secret, slen);
+ hinfo->slen = slen;
+ hinfo->alg_id = algid;
+ hinfo->hmackeyid = hmackeyid;
+
+ err = seg6_hmac_info_add(net, hmackeyid, hinfo);
+ if (err)
+ kfree(hinfo);
+
+out_unlock:
+ mutex_unlock(&sdata->lock);
+ return err;
+}
+
+#else
+
+static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info)
+{
+ return -ENOTSUPP;
+}
+
+#endif
+
+static int seg6_genl_set_tunsrc(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct in6_addr *val, *t_old, *t_new;
+ struct seg6_pernet_data *sdata;
+
+ sdata = seg6_pernet(net);
+
+ if (!info->attrs[SEG6_ATTR_DST])
+ return -EINVAL;
+
+ val = nla_data(info->attrs[SEG6_ATTR_DST]);
+ t_new = kmemdup(val, sizeof(*val), GFP_KERNEL);
+ if (!t_new)
+ return -ENOMEM;
+
+ mutex_lock(&sdata->lock);
+
+ t_old = sdata->tun_src;
+ rcu_assign_pointer(sdata->tun_src, t_new);
+
+ mutex_unlock(&sdata->lock);
+
+ synchronize_net();
+ kfree(t_old);
+
+ return 0;
+}
+
+static int seg6_genl_get_tunsrc(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct in6_addr *tun_src;
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &seg6_genl_family, 0, SEG6_CMD_GET_TUNSRC);
+ if (!hdr)
+ goto free_msg;
+
+ rcu_read_lock();
+ tun_src = rcu_dereference(seg6_pernet(net)->tun_src);
+
+ if (nla_put(msg, SEG6_ATTR_DST, sizeof(struct in6_addr), tun_src))
+ goto nla_put_failure;
+
+ rcu_read_unlock();
+
+ genlmsg_end(msg, hdr);
+ genlmsg_reply(msg, info);
+
+ return 0;
+
+nla_put_failure:
+ rcu_read_unlock();
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -ENOMEM;
+}
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+
+static int __seg6_hmac_fill_info(struct seg6_hmac_info *hinfo,
+ struct sk_buff *msg)
+{
+ if (nla_put_u32(msg, SEG6_ATTR_HMACKEYID, hinfo->hmackeyid) ||
+ nla_put_u8(msg, SEG6_ATTR_SECRETLEN, hinfo->slen) ||
+ nla_put(msg, SEG6_ATTR_SECRET, hinfo->slen, hinfo->secret) ||
+ nla_put_u8(msg, SEG6_ATTR_ALGID, hinfo->alg_id))
+ return -1;
+
+ return 0;
+}
+
+static int __seg6_genl_dumphmac_element(struct seg6_hmac_info *hinfo,
+ u32 portid, u32 seq, u32 flags,
+ struct sk_buff *skb, u8 cmd)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &seg6_genl_family, flags, cmd);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (__seg6_hmac_fill_info(hinfo, skb) < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int seg6_genl_dumphmac_start(struct netlink_callback *cb)
+{
+ struct net *net = sock_net(cb->skb->sk);
+ struct seg6_pernet_data *sdata;
+ struct rhashtable_iter *iter;
+
+ sdata = seg6_pernet(net);
+ iter = (struct rhashtable_iter *)cb->args[0];
+
+ if (!iter) {
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ cb->args[0] = (long)iter;
+ }
+
+ rhashtable_walk_enter(&sdata->hmac_infos, iter);
+
+ return 0;
+}
+
+static int seg6_genl_dumphmac_done(struct netlink_callback *cb)
+{
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+ rhashtable_walk_exit(iter);
+
+ kfree(iter);
+
+ return 0;
+}
+
+static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+ struct net *net = sock_net(skb->sk);
+ struct seg6_pernet_data *sdata;
+ struct seg6_hmac_info *hinfo;
+ int ret;
+
+ sdata = seg6_pernet(net);
+
+ ret = rhashtable_walk_start(iter);
+ if (ret && ret != -EAGAIN)
+ goto done;
+
+ for (;;) {
+ hinfo = rhashtable_walk_next(iter);
+
+ if (IS_ERR(hinfo)) {
+ if (PTR_ERR(hinfo) == -EAGAIN)
+ continue;
+ ret = PTR_ERR(hinfo);
+ goto done;
+ } else if (!hinfo) {
+ break;
+ }
+
+ ret = __seg6_genl_dumphmac_element(hinfo,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI,
+ skb, SEG6_CMD_DUMPHMAC);
+ if (ret)
+ goto done;
+ }
+
+ ret = skb->len;
+
+done:
+ rhashtable_walk_stop(iter);
+ return ret;
+}
+
+#else
+
+static int seg6_genl_dumphmac_start(struct netlink_callback *cb)
+{
+ return 0;
+}
+
+static int seg6_genl_dumphmac_done(struct netlink_callback *cb)
+{
+ return 0;
+}
+
+static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return -ENOTSUPP;
+}
+
+#endif
+
+static int __net_init seg6_net_init(struct net *net)
+{
+ struct seg6_pernet_data *sdata;
+
+ sdata = kzalloc(sizeof(*sdata), GFP_KERNEL);
+ if (!sdata)
+ return -ENOMEM;
+
+ mutex_init(&sdata->lock);
+
+ sdata->tun_src = kzalloc(sizeof(*sdata->tun_src), GFP_KERNEL);
+ if (!sdata->tun_src) {
+ kfree(sdata);
+ return -ENOMEM;
+ }
+
+ net->ipv6.seg6_data = sdata;
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ seg6_hmac_net_init(net);
+#endif
+
+ return 0;
+}
+
+static void __net_exit seg6_net_exit(struct net *net)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ seg6_hmac_net_exit(net);
+#endif
+
+ kfree(sdata->tun_src);
+ kfree(sdata);
+}
+
+static struct pernet_operations ip6_segments_ops = {
+ .init = seg6_net_init,
+ .exit = seg6_net_exit,
+};
+
+static const struct genl_ops seg6_genl_ops[] = {
+ {
+ .cmd = SEG6_CMD_SETHMAC,
+ .doit = seg6_genl_sethmac,
+ .policy = seg6_genl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = SEG6_CMD_DUMPHMAC,
+ .start = seg6_genl_dumphmac_start,
+ .dumpit = seg6_genl_dumphmac,
+ .done = seg6_genl_dumphmac_done,
+ .policy = seg6_genl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = SEG6_CMD_SET_TUNSRC,
+ .doit = seg6_genl_set_tunsrc,
+ .policy = seg6_genl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = SEG6_CMD_GET_TUNSRC,
+ .doit = seg6_genl_get_tunsrc,
+ .policy = seg6_genl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+};
+
+static struct genl_family seg6_genl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = SEG6_GENL_NAME,
+ .version = SEG6_GENL_VERSION,
+ .maxattr = SEG6_ATTR_MAX,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = seg6_genl_ops,
+ .n_ops = ARRAY_SIZE(seg6_genl_ops),
+ .module = THIS_MODULE,
+};
+
+int __init seg6_init(void)
+{
+ int err = -ENOMEM;
+
+ err = genl_register_family(&seg6_genl_family);
+ if (err)
+ goto out;
+
+ err = register_pernet_subsys(&ip6_segments_ops);
+ if (err)
+ goto out_unregister_genl;
+
+#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
+ err = seg6_iptunnel_init();
+ if (err)
+ goto out_unregister_pernet;
+#endif
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ err = seg6_hmac_init();
+ if (err)
+ goto out_unregister_iptun;
+#endif
+
+ pr_info("Segment Routing with IPv6\n");
+
+out:
+ return err;
+#ifdef CONFIG_IPV6_SEG6_HMAC
+out_unregister_iptun:
+#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
+ seg6_iptunnel_exit();
+#endif
+#endif
+#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
+out_unregister_pernet:
+ unregister_pernet_subsys(&ip6_segments_ops);
+#endif
+out_unregister_genl:
+ genl_unregister_family(&seg6_genl_family);
+ goto out;
+}
+
+void seg6_exit(void)
+{
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ seg6_hmac_exit();
+#endif
+#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
+ seg6_iptunnel_exit();
+#endif
+ unregister_pernet_subsys(&ip6_segments_ops);
+ genl_unregister_family(&seg6_genl_family);
+}
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
new file mode 100644
index 000000000000..f950cb53d5e3
--- /dev/null
+++ b/net/ipv6/seg6_hmac.c
@@ -0,0 +1,448 @@
+/*
+ * SR-IPv6 implementation -- HMAC functions
+ *
+ * Author:
+ * David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/icmpv6.h>
+#include <linux/mroute6.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/rawv6.h>
+#include <net/ndisc.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/xfrm.h>
+
+#include <linux/cryptohash.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <net/seg6.h>
+#include <net/genetlink.h>
+#include <net/seg6_hmac.h>
+#include <linux/random.h>
+
+static DEFINE_PER_CPU(char [SEG6_HMAC_RING_SIZE], hmac_ring);
+
+static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+ const struct seg6_hmac_info *hinfo = obj;
+
+ return (hinfo->hmackeyid != *(__u32 *)arg->key);
+}
+
+static inline void seg6_hinfo_release(struct seg6_hmac_info *hinfo)
+{
+ kfree_rcu(hinfo, rcu);
+}
+
+static void seg6_free_hi(void *ptr, void *arg)
+{
+ struct seg6_hmac_info *hinfo = (struct seg6_hmac_info *)ptr;
+
+ if (hinfo)
+ seg6_hinfo_release(hinfo);
+}
+
+static const struct rhashtable_params rht_params = {
+ .head_offset = offsetof(struct seg6_hmac_info, node),
+ .key_offset = offsetof(struct seg6_hmac_info, hmackeyid),
+ .key_len = sizeof(u32),
+ .automatic_shrinking = true,
+ .obj_cmpfn = seg6_hmac_cmpfn,
+};
+
+static struct seg6_hmac_algo hmac_algos[] = {
+ {
+ .alg_id = SEG6_HMAC_ALGO_SHA1,
+ .name = "hmac(sha1)",
+ },
+ {
+ .alg_id = SEG6_HMAC_ALGO_SHA256,
+ .name = "hmac(sha256)",
+ },
+};
+
+static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh)
+{
+ struct sr6_tlv_hmac *tlv;
+
+ if (srh->hdrlen < (srh->first_segment + 1) * 2 + 5)
+ return NULL;
+
+ if (!sr_has_hmac(srh))
+ return NULL;
+
+ tlv = (struct sr6_tlv_hmac *)
+ ((char *)srh + ((srh->hdrlen + 1) << 3) - 40);
+
+ if (tlv->tlvhdr.type != SR6_TLV_HMAC || tlv->tlvhdr.len != 38)
+ return NULL;
+
+ return tlv;
+}
+
+static struct seg6_hmac_algo *__hmac_get_algo(u8 alg_id)
+{
+ struct seg6_hmac_algo *algo;
+ int i, alg_count;
+
+ alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
+ for (i = 0; i < alg_count; i++) {
+ algo = &hmac_algos[i];
+ if (algo->alg_id == alg_id)
+ return algo;
+ }
+
+ return NULL;
+}
+
+static int __do_hmac(struct seg6_hmac_info *hinfo, const char *text, u8 psize,
+ u8 *output, int outlen)
+{
+ struct seg6_hmac_algo *algo;
+ struct crypto_shash *tfm;
+ struct shash_desc *shash;
+ int ret, dgsize;
+
+ algo = __hmac_get_algo(hinfo->alg_id);
+ if (!algo)
+ return -ENOENT;
+
+ tfm = *this_cpu_ptr(algo->tfms);
+
+ dgsize = crypto_shash_digestsize(tfm);
+ if (dgsize > outlen) {
+ pr_debug("sr-ipv6: __do_hmac: digest size too big (%d / %d)\n",
+ dgsize, outlen);
+ return -ENOMEM;
+ }
+
+ ret = crypto_shash_setkey(tfm, hinfo->secret, hinfo->slen);
+ if (ret < 0) {
+ pr_debug("sr-ipv6: crypto_shash_setkey failed: err %d\n", ret);
+ goto failed;
+ }
+
+ shash = *this_cpu_ptr(algo->shashs);
+ shash->tfm = tfm;
+
+ ret = crypto_shash_digest(shash, text, psize, output);
+ if (ret < 0) {
+ pr_debug("sr-ipv6: crypto_shash_digest failed: err %d\n", ret);
+ goto failed;
+ }
+
+ return dgsize;
+
+failed:
+ return ret;
+}
+
+int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
+ struct in6_addr *saddr, u8 *output)
+{
+ __be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid);
+ u8 tmp_out[SEG6_HMAC_MAX_DIGESTSIZE];
+ int plen, i, dgsize, wrsize;
+ char *ring, *off;
+
+ /* a 160-byte buffer for digest output allows to store highest known
+ * hash function (RadioGatun) with up to 1216 bits
+ */
+
+ /* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */
+ plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16;
+
+ /* this limit allows for 14 segments */
+ if (plen >= SEG6_HMAC_RING_SIZE)
+ return -EMSGSIZE;
+
+ /* Let's build the HMAC text on the ring buffer. The text is composed
+ * as follows, in order:
+ *
+ * 1. Source IPv6 address (128 bits)
+ * 2. first_segment value (8 bits)
+ * 3. Flags (8 bits)
+ * 4. HMAC Key ID (32 bits)
+ * 5. All segments in the segments list (n * 128 bits)
+ */
+
+ local_bh_disable();
+ ring = this_cpu_ptr(hmac_ring);
+ off = ring;
+
+ /* source address */
+ memcpy(off, saddr, 16);
+ off += 16;
+
+ /* first_segment value */
+ *off++ = hdr->first_segment;
+
+ /* flags */
+ *off++ = hdr->flags;
+
+ /* HMAC Key ID */
+ memcpy(off, &hmackeyid, 4);
+ off += 4;
+
+ /* all segments in the list */
+ for (i = 0; i < hdr->first_segment + 1; i++) {
+ memcpy(off, hdr->segments + i, 16);
+ off += 16;
+ }
+
+ dgsize = __do_hmac(hinfo, ring, plen, tmp_out,
+ SEG6_HMAC_MAX_DIGESTSIZE);
+ local_bh_enable();
+
+ if (dgsize < 0)
+ return dgsize;
+
+ wrsize = SEG6_HMAC_FIELD_LEN;
+ if (wrsize > dgsize)
+ wrsize = dgsize;
+
+ memset(output, 0, SEG6_HMAC_FIELD_LEN);
+ memcpy(output, tmp_out, wrsize);
+
+ return 0;
+}
+EXPORT_SYMBOL(seg6_hmac_compute);
+
+/* checks if an incoming SR-enabled packet's HMAC status matches
+ * the incoming policy.
+ *
+ * called with rcu_read_lock()
+ */
+bool seg6_hmac_validate_skb(struct sk_buff *skb)
+{
+ u8 hmac_output[SEG6_HMAC_FIELD_LEN];
+ struct net *net = dev_net(skb->dev);
+ struct seg6_hmac_info *hinfo;
+ struct sr6_tlv_hmac *tlv;
+ struct ipv6_sr_hdr *srh;
+ struct inet6_dev *idev;
+
+ idev = __in6_dev_get(skb->dev);
+
+ srh = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+
+ tlv = seg6_get_tlv_hmac(srh);
+
+ /* mandatory check but no tlv */
+ if (idev->cnf.seg6_require_hmac > 0 && !tlv)
+ return false;
+
+ /* no check */
+ if (idev->cnf.seg6_require_hmac < 0)
+ return true;
+
+ /* check only if present */
+ if (idev->cnf.seg6_require_hmac == 0 && !tlv)
+ return true;
+
+ /* now, seg6_require_hmac >= 0 && tlv */
+
+ hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid));
+ if (!hinfo)
+ return false;
+
+ if (seg6_hmac_compute(hinfo, srh, &ipv6_hdr(skb)->saddr, hmac_output))
+ return false;
+
+ if (memcmp(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN) != 0)
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL(seg6_hmac_validate_skb);
+
+/* called with rcu_read_lock() */
+struct seg6_hmac_info *seg6_hmac_info_lookup(struct net *net, u32 key)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+ struct seg6_hmac_info *hinfo;
+
+ hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params);
+
+ return hinfo;
+}
+EXPORT_SYMBOL(seg6_hmac_info_lookup);
+
+int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+ int err;
+
+ err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node,
+ rht_params);
+
+ return err;
+}
+EXPORT_SYMBOL(seg6_hmac_info_add);
+
+int seg6_hmac_info_del(struct net *net, u32 key)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+ struct seg6_hmac_info *hinfo;
+ int err = -ENOENT;
+
+ hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params);
+ if (!hinfo)
+ goto out;
+
+ err = rhashtable_remove_fast(&sdata->hmac_infos, &hinfo->node,
+ rht_params);
+ if (err)
+ goto out;
+
+ seg6_hinfo_release(hinfo);
+
+out:
+ return err;
+}
+EXPORT_SYMBOL(seg6_hmac_info_del);
+
+int seg6_push_hmac(struct net *net, struct in6_addr *saddr,
+ struct ipv6_sr_hdr *srh)
+{
+ struct seg6_hmac_info *hinfo;
+ struct sr6_tlv_hmac *tlv;
+ int err = -ENOENT;
+
+ tlv = seg6_get_tlv_hmac(srh);
+ if (!tlv)
+ return -EINVAL;
+
+ rcu_read_lock();
+
+ hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid));
+ if (!hinfo)
+ goto out;
+
+ memset(tlv->hmac, 0, SEG6_HMAC_FIELD_LEN);
+ err = seg6_hmac_compute(hinfo, srh, saddr, tlv->hmac);
+
+out:
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(seg6_push_hmac);
+
+static int seg6_hmac_init_algo(void)
+{
+ struct seg6_hmac_algo *algo;
+ struct crypto_shash *tfm;
+ struct shash_desc *shash;
+ int i, alg_count, cpu;
+
+ alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
+
+ for (i = 0; i < alg_count; i++) {
+ struct crypto_shash **p_tfm;
+ int shsize;
+
+ algo = &hmac_algos[i];
+ algo->tfms = alloc_percpu(struct crypto_shash *);
+ if (!algo->tfms)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ tfm = crypto_alloc_shash(algo->name, 0, GFP_KERNEL);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+ p_tfm = per_cpu_ptr(algo->tfms, cpu);
+ *p_tfm = tfm;
+ }
+
+ p_tfm = raw_cpu_ptr(algo->tfms);
+ tfm = *p_tfm;
+
+ shsize = sizeof(*shash) + crypto_shash_descsize(tfm);
+
+ algo->shashs = alloc_percpu(struct shash_desc *);
+ if (!algo->shashs)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ shash = kzalloc_node(shsize, GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!shash)
+ return -ENOMEM;
+ *per_cpu_ptr(algo->shashs, cpu) = shash;
+ }
+ }
+
+ return 0;
+}
+
+int __init seg6_hmac_init(void)
+{
+ return seg6_hmac_init_algo();
+}
+EXPORT_SYMBOL(seg6_hmac_init);
+
+int __net_init seg6_hmac_net_init(struct net *net)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+
+ rhashtable_init(&sdata->hmac_infos, &rht_params);
+
+ return 0;
+}
+EXPORT_SYMBOL(seg6_hmac_net_init);
+
+void seg6_hmac_exit(void)
+{
+ struct seg6_hmac_algo *algo = NULL;
+ int i, alg_count, cpu;
+
+ alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
+ for (i = 0; i < alg_count; i++) {
+ algo = &hmac_algos[i];
+ for_each_possible_cpu(cpu) {
+ struct crypto_shash *tfm;
+ struct shash_desc *shash;
+
+ shash = *per_cpu_ptr(algo->shashs, cpu);
+ kfree(shash);
+ tfm = *per_cpu_ptr(algo->tfms, cpu);
+ crypto_free_shash(tfm);
+ }
+ free_percpu(algo->tfms);
+ free_percpu(algo->shashs);
+ }
+}
+EXPORT_SYMBOL(seg6_hmac_exit);
+
+void __net_exit seg6_hmac_net_exit(struct net *net)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+
+ rhashtable_free_and_destroy(&sdata->hmac_infos, seg6_free_hi, NULL);
+}
+EXPORT_SYMBOL(seg6_hmac_net_exit);
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
new file mode 100644
index 000000000000..85582257d3af
--- /dev/null
+++ b/net/ipv6/seg6_iptunnel.c
@@ -0,0 +1,436 @@
+/*
+ * SR-IPv6 implementation
+ *
+ * Author:
+ * David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/lwtunnel.h>
+#include <net/netevent.h>
+#include <net/netns/generic.h>
+#include <net/ip6_fib.h>
+#include <net/route.h>
+#include <net/seg6.h>
+#include <linux/seg6.h>
+#include <linux/seg6_iptunnel.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#ifdef CONFIG_DST_CACHE
+#include <net/dst_cache.h>
+#endif
+#ifdef CONFIG_IPV6_SEG6_HMAC
+#include <net/seg6_hmac.h>
+#endif
+
+struct seg6_lwt {
+#ifdef CONFIG_DST_CACHE
+ struct dst_cache cache;
+#endif
+ struct seg6_iptunnel_encap tuninfo[0];
+};
+
+static inline struct seg6_lwt *seg6_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return (struct seg6_lwt *)lwt->data;
+}
+
+static inline struct seg6_iptunnel_encap *
+seg6_encap_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return seg6_lwt_lwtunnel(lwt)->tuninfo;
+}
+
+static const struct nla_policy seg6_iptunnel_policy[SEG6_IPTUNNEL_MAX + 1] = {
+ [SEG6_IPTUNNEL_SRH] = { .type = NLA_BINARY },
+};
+
+static int nla_put_srh(struct sk_buff *skb, int attrtype,
+ struct seg6_iptunnel_encap *tuninfo)
+{
+ struct seg6_iptunnel_encap *data;
+ struct nlattr *nla;
+ int len;
+
+ len = SEG6_IPTUN_ENCAP_SIZE(tuninfo);
+
+ nla = nla_reserve(skb, attrtype, len);
+ if (!nla)
+ return -EMSGSIZE;
+
+ data = nla_data(nla);
+ memcpy(data, tuninfo, len);
+
+ return 0;
+}
+
+static void set_tun_src(struct net *net, struct net_device *dev,
+ struct in6_addr *daddr, struct in6_addr *saddr)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+ struct in6_addr *tun_src;
+
+ rcu_read_lock();
+
+ tun_src = rcu_dereference(sdata->tun_src);
+
+ if (!ipv6_addr_any(tun_src)) {
+ memcpy(saddr, tun_src, sizeof(struct in6_addr));
+ } else {
+ ipv6_dev_get_saddr(net, dev, daddr, IPV6_PREFER_SRC_PUBLIC,
+ saddr);
+ }
+
+ rcu_read_unlock();
+}
+
+/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */
+static int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
+{
+ struct net *net = dev_net(skb_dst(skb)->dev);
+ struct ipv6hdr *hdr, *inner_hdr;
+ struct ipv6_sr_hdr *isrh;
+ int hdrlen, tot_len, err;
+
+ hdrlen = (osrh->hdrlen + 1) << 3;
+ tot_len = hdrlen + sizeof(*hdr);
+
+ err = pskb_expand_head(skb, tot_len, 0, GFP_ATOMIC);
+ if (unlikely(err))
+ return err;
+
+ inner_hdr = ipv6_hdr(skb);
+
+ skb_push(skb, tot_len);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+ hdr = ipv6_hdr(skb);
+
+ /* inherit tc, flowlabel and hlim
+ * hlim will be decremented in ip6_forward() afterwards and
+ * decapsulation will overwrite inner hlim with outer hlim
+ */
+ ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
+ ip6_flowlabel(inner_hdr));
+ hdr->hop_limit = inner_hdr->hop_limit;
+ hdr->nexthdr = NEXTHDR_ROUTING;
+
+ isrh = (void *)hdr + sizeof(*hdr);
+ memcpy(isrh, osrh, hdrlen);
+
+ isrh->nexthdr = NEXTHDR_IPV6;
+
+ hdr->daddr = isrh->segments[isrh->first_segment];
+ set_tun_src(net, skb->dev, &hdr->daddr, &hdr->saddr);
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (sr_has_hmac(isrh)) {
+ err = seg6_push_hmac(net, &hdr->saddr, isrh);
+ if (unlikely(err))
+ return err;
+ }
+#endif
+
+ skb_postpush_rcsum(skb, hdr, tot_len);
+
+ return 0;
+}
+
+/* insert an SRH within an IPv6 packet, just after the IPv6 header */
+#ifdef CONFIG_IPV6_SEG6_INLINE
+static int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
+{
+ struct ipv6hdr *hdr, *oldhdr;
+ struct ipv6_sr_hdr *isrh;
+ int hdrlen, err;
+
+ hdrlen = (osrh->hdrlen + 1) << 3;
+
+ err = pskb_expand_head(skb, hdrlen, 0, GFP_ATOMIC);
+ if (unlikely(err))
+ return err;
+
+ oldhdr = ipv6_hdr(skb);
+
+ skb_pull(skb, sizeof(struct ipv6hdr));
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ sizeof(struct ipv6hdr));
+
+ skb_push(skb, sizeof(struct ipv6hdr) + hdrlen);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
+ hdr = ipv6_hdr(skb);
+
+ memmove(hdr, oldhdr, sizeof(*hdr));
+
+ isrh = (void *)hdr + sizeof(*hdr);
+ memcpy(isrh, osrh, hdrlen);
+
+ isrh->nexthdr = hdr->nexthdr;
+ hdr->nexthdr = NEXTHDR_ROUTING;
+
+ isrh->segments[0] = hdr->daddr;
+ hdr->daddr = isrh->segments[isrh->first_segment];
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (sr_has_hmac(isrh)) {
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ err = seg6_push_hmac(net, &hdr->saddr, isrh);
+ if (unlikely(err))
+ return err;
+ }
+#endif
+
+ skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen);
+
+ return 0;
+}
+#endif
+
+static int seg6_do_srh(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct seg6_iptunnel_encap *tinfo;
+ int err = 0;
+
+ tinfo = seg6_encap_lwtunnel(dst->lwtstate);
+
+ if (likely(!skb->encapsulation)) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
+
+ switch (tinfo->mode) {
+#ifdef CONFIG_IPV6_SEG6_INLINE
+ case SEG6_IPTUN_MODE_INLINE:
+ err = seg6_do_srh_inline(skb, tinfo->srh);
+ skb_reset_inner_headers(skb);
+ break;
+#endif
+ case SEG6_IPTUN_MODE_ENCAP:
+ err = seg6_do_srh_encap(skb, tinfo->srh);
+ break;
+ }
+
+ if (err)
+ return err;
+
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ skb_set_inner_protocol(skb, skb->protocol);
+
+ return 0;
+}
+
+static int seg6_input(struct sk_buff *skb)
+{
+ int err;
+
+ err = seg6_do_srh(skb);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ skb_dst_drop(skb);
+ ip6_route_input(skb);
+
+ return dst_input(skb);
+}
+
+static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct dst_entry *dst = NULL;
+ struct seg6_lwt *slwt;
+ int err = -EINVAL;
+
+ err = seg6_do_srh(skb);
+ if (unlikely(err))
+ goto drop;
+
+ slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate);
+
+#ifdef CONFIG_DST_CACHE
+ preempt_disable();
+ dst = dst_cache_get(&slwt->cache);
+ preempt_enable();
+#endif
+
+ if (unlikely(!dst)) {
+ struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct flowi6 fl6;
+
+ fl6.daddr = hdr->daddr;
+ fl6.saddr = hdr->saddr;
+ fl6.flowlabel = ip6_flowinfo(hdr);
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_proto = hdr->nexthdr;
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ err = dst->error;
+ dst_release(dst);
+ goto drop;
+ }
+
+#ifdef CONFIG_DST_CACHE
+ preempt_disable();
+ dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr);
+ preempt_enable();
+#endif
+ }
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ return dst_output(net, sk, skb);
+drop:
+ kfree_skb(skb);
+ return err;
+}
+
+static int seg6_build_state(struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts)
+{
+ struct nlattr *tb[SEG6_IPTUNNEL_MAX + 1];
+ struct seg6_iptunnel_encap *tuninfo;
+ struct lwtunnel_state *newts;
+ int tuninfo_len, min_size;
+ struct seg6_lwt *slwt;
+ int err;
+
+ err = nla_parse_nested(tb, SEG6_IPTUNNEL_MAX, nla,
+ seg6_iptunnel_policy);
+
+ if (err < 0)
+ return err;
+
+ if (!tb[SEG6_IPTUNNEL_SRH])
+ return -EINVAL;
+
+ tuninfo = nla_data(tb[SEG6_IPTUNNEL_SRH]);
+ tuninfo_len = nla_len(tb[SEG6_IPTUNNEL_SRH]);
+
+ /* tuninfo must contain at least the iptunnel encap structure,
+ * the SRH and one segment
+ */
+ min_size = sizeof(*tuninfo) + sizeof(struct ipv6_sr_hdr) +
+ sizeof(struct in6_addr);
+ if (tuninfo_len < min_size)
+ return -EINVAL;
+
+ switch (tuninfo->mode) {
+#ifdef CONFIG_IPV6_SEG6_INLINE
+ case SEG6_IPTUN_MODE_INLINE:
+ break;
+#endif
+ case SEG6_IPTUN_MODE_ENCAP:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* verify that SRH is consistent */
+ if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo)))
+ return -EINVAL;
+
+ newts = lwtunnel_state_alloc(tuninfo_len + sizeof(*slwt));
+ if (!newts)
+ return -ENOMEM;
+
+ slwt = seg6_lwt_lwtunnel(newts);
+
+#ifdef CONFIG_DST_CACHE
+ err = dst_cache_init(&slwt->cache, GFP_KERNEL);
+ if (err) {
+ kfree(newts);
+ return err;
+ }
+#endif
+
+ memcpy(&slwt->tuninfo, tuninfo, tuninfo_len);
+
+ newts->type = LWTUNNEL_ENCAP_SEG6;
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT |
+ LWTUNNEL_STATE_INPUT_REDIRECT;
+ newts->headroom = seg6_lwt_headroom(tuninfo);
+
+ *ts = newts;
+
+ return 0;
+}
+
+#ifdef CONFIG_DST_CACHE
+static void seg6_destroy_state(struct lwtunnel_state *lwt)
+{
+ dst_cache_destroy(&seg6_lwt_lwtunnel(lwt)->cache);
+}
+#endif
+
+static int seg6_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate);
+
+ if (nla_put_srh(skb, SEG6_IPTUNNEL_SRH, tuninfo))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int seg6_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate);
+
+ return nla_total_size(SEG6_IPTUN_ENCAP_SIZE(tuninfo));
+}
+
+static int seg6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct seg6_iptunnel_encap *a_hdr = seg6_encap_lwtunnel(a);
+ struct seg6_iptunnel_encap *b_hdr = seg6_encap_lwtunnel(b);
+ int len = SEG6_IPTUN_ENCAP_SIZE(a_hdr);
+
+ if (len != SEG6_IPTUN_ENCAP_SIZE(b_hdr))
+ return 1;
+
+ return memcmp(a_hdr, b_hdr, len);
+}
+
+static const struct lwtunnel_encap_ops seg6_iptun_ops = {
+ .build_state = seg6_build_state,
+#ifdef CONFIG_DST_CACHE
+ .destroy_state = seg6_destroy_state,
+#endif
+ .output = seg6_output,
+ .input = seg6_input,
+ .fill_encap = seg6_fill_encap_info,
+ .get_encap_size = seg6_encap_nlsize,
+ .cmp_encap = seg6_encap_cmp,
+ .owner = THIS_MODULE,
+};
+
+int __init seg6_iptunnel_init(void)
+{
+ return lwtunnel_encap_add_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6);
+}
+
+void seg6_iptunnel_exit(void)
+{
+ lwtunnel_encap_del_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6);
+}
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index b1cdf8009d29..99853c6e33a8 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -31,7 +31,7 @@
#include <linux/if_arp.h>
#include <linux/icmp.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>
@@ -76,7 +76,7 @@ static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst,
__be32 *v4dst);
static struct rtnl_link_ops sit_link_ops __read_mostly;
-static int sit_net_id __read_mostly;
+static unsigned int sit_net_id __read_mostly;
struct sit_net {
struct ip_tunnel __rcu *tunnels_r_l[IP6_SIT_HASH_SIZE];
struct ip_tunnel __rcu *tunnels_r[IP6_SIT_HASH_SIZE];
@@ -1318,23 +1318,11 @@ done:
return err;
}
-static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu)
-{
- struct ip_tunnel *tunnel = netdev_priv(dev);
- int t_hlen = tunnel->hlen + sizeof(struct iphdr);
-
- if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - t_hlen)
- return -EINVAL;
- dev->mtu = new_mtu;
- return 0;
-}
-
static const struct net_device_ops ipip6_netdev_ops = {
.ndo_init = ipip6_tunnel_init,
.ndo_uninit = ipip6_tunnel_uninit,
.ndo_start_xmit = sit_tunnel_xmit,
.ndo_do_ioctl = ipip6_tunnel_ioctl,
- .ndo_change_mtu = ipip6_tunnel_change_mtu,
.ndo_get_stats64 = ip_tunnel_get_stats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
};
@@ -1365,6 +1353,8 @@ static void ipip6_tunnel_setup(struct net_device *dev)
dev->type = ARPHRD_SIT;
dev->hard_header_len = LL_MAX_HEADER + t_hlen;
dev->mtu = ETH_DATA_LEN - t_hlen;
+ dev->min_mtu = IPV6_MIN_MTU;
+ dev->max_mtu = 0xFFF8 - t_hlen;
dev->flags = IFF_NOARP;
netif_keep_dst(dev);
dev->addr_len = 4;
@@ -1390,6 +1380,7 @@ static int ipip6_tunnel_init(struct net_device *dev)
err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
if (err) {
free_percpu(dev->tstats);
+ dev->tstats = NULL;
return err;
}
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 59c483937aec..895ff650db43 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -16,7 +16,7 @@
#include <linux/tcp.h>
#include <linux/random.h>
-#include <linux/cryptohash.h>
+#include <linux/siphash.h>
#include <linux/kernel.h>
#include <net/ipv6.h>
#include <net/tcp.h>
@@ -24,7 +24,7 @@
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
-static u32 syncookie6_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
+static siphash_key_t syncookie6_secret[2] __read_mostly;
/* RFC 2460, Section 8.3:
* [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..]
@@ -41,30 +41,27 @@ static __u16 const msstab[] = {
9000 - 60,
};
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch);
-
-static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr,
+static u32 cookie_hash(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
__be16 sport, __be16 dport, u32 count, int c)
{
- __u32 *tmp;
+ const struct {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ u32 count;
+ __be16 sport;
+ __be16 dport;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .saddr = *saddr,
+ .daddr = *daddr,
+ .count = count,
+ .sport = sport,
+ .dport = dport
+ };
net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret));
-
- tmp = this_cpu_ptr(ipv6_cookie_scratch);
-
- /*
- * we have 320 bits of information to hash, copy in the remaining
- * 192 bits required for sha_transform, from the syncookie6_secret
- * and overwrite the digest with the secret
- */
- memcpy(tmp + 10, syncookie6_secret[c], 44);
- memcpy(tmp, saddr, 16);
- memcpy(tmp + 4, daddr, 16);
- tmp[8] = ((__force u32)sport << 16) + (__force u32)dport;
- tmp[9] = count;
- sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
-
- return tmp[17];
+ return siphash(&combined, offsetofend(typeof(combined), dport),
+ &syncookie6_secret[c]);
}
static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr,
@@ -209,6 +206,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
treq->snt_synack.v64 = 0;
treq->rcv_isn = ntohl(th->seq) - 1;
treq->snt_isn = cookie;
+ treq->ts_off = 0;
/*
* We need to lookup the dst_entry to get the correct window size.
@@ -227,6 +225,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
fl6.flowi6_mark = ireq->ir_mark;
fl6.fl6_dport = ireq->ir_rmt_port;
fl6.fl6_sport = inet_sk(sk)->inet_sport;
+ fl6.flowi6_uid = sk->sk_uid;
security_req_classify_flow(req, flowi6_to_flowi(&fl6));
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b9f1fee9a886..49fa2e8c3fa9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -101,12 +101,12 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
}
}
-static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
+static u32 tcp_v6_init_sequence(const struct sk_buff *skb, u32 *tsoff)
{
return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
ipv6_hdr(skb)->saddr.s6_addr32,
tcp_hdr(skb)->dest,
- tcp_hdr(skb)->source);
+ tcp_hdr(skb)->source, tsoff);
}
static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
@@ -122,7 +122,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
struct flowi6 fl6;
struct dst_entry *dst;
int addr_type;
+ u32 seq;
int err;
+ struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
@@ -148,8 +150,13 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
* connect() to INADDR_ANY means loopback (BSD'ism).
*/
- if (ipv6_addr_any(&usin->sin6_addr))
- usin->sin6_addr.s6_addr[15] = 0x1;
+ if (ipv6_addr_any(&usin->sin6_addr)) {
+ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+ ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
+ &usin->sin6_addr);
+ else
+ usin->sin6_addr = in6addr_loopback;
+ }
addr_type = ipv6_addr_type(&usin->sin6_addr);
@@ -188,7 +195,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
* TCP over IPv4
*/
- if (addr_type == IPV6_ADDR_MAPPED) {
+ if (addr_type & IPV6_ADDR_MAPPED) {
u32 exthdrlen = icsk->icsk_ext_hdr_len;
struct sockaddr_in sin;
@@ -233,6 +240,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_dport = usin->sin6_port;
fl6.fl6_sport = inet->inet_sport;
+ fl6.flowi6_uid = sk->sk_uid;
opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
final_p = fl6_update_dst(&fl6, opt, &final);
@@ -257,7 +265,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
sk->sk_gso_type = SKB_GSO_TCPV6;
ip6_dst_store(sk, dst, NULL, NULL);
- if (tcp_death_row.sysctl_tw_recycle &&
+ if (tcp_death_row->sysctl_tw_recycle &&
!tp->rx_opt.ts_recent_stamp &&
ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr))
tcp_fetch_timewait_stamp(sk, dst);
@@ -272,17 +280,26 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
inet->inet_dport = usin->sin6_port;
tcp_set_state(sk, TCP_SYN_SENT);
- err = inet6_hash_connect(&tcp_death_row, sk);
+ err = inet6_hash_connect(tcp_death_row, sk);
if (err)
goto late_failure;
sk_set_txhash(sk);
- if (!tp->write_seq && likely(!tp->repair))
- tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
- sk->sk_v6_daddr.s6_addr32,
- inet->inet_sport,
- inet->inet_dport);
+ if (likely(!tp->repair)) {
+ seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
+ sk->sk_v6_daddr.s6_addr32,
+ inet->inet_sport,
+ inet->inet_dport,
+ &tp->tsoffset);
+ if (!tp->write_seq)
+ tp->write_seq = seq;
+ }
+
+ if (tcp_fastopen_defer_connect(sk, &err))
+ return err;
+ if (err)
+ goto late_failure;
err = tcp_connect(sk);
if (err)
@@ -292,7 +309,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
late_failure:
tcp_set_state(sk, TCP_CLOSE);
- __sk_dst_reset(sk);
failure:
inet->inet_dport = 0;
sk->sk_route_caps = 0;
@@ -375,10 +391,12 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
np = inet6_sk(sk);
if (type == NDISC_REDIRECT) {
- struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
+ if (!sock_owned_by_user(sk)) {
+ struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
- if (dst)
- dst->ops->redirect(dst, sk, skb);
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
+ }
goto out;
}
@@ -397,7 +415,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!sock_owned_by_user(sk))
tcp_v6_mtu_reduced(sk);
else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
- &tp->tsq_flags))
+ &sk->sk_tsq_flags))
sock_hold(sk);
goto out;
}
@@ -467,7 +485,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
opt = ireq->ipv6_opt;
if (!opt)
opt = rcu_dereference(np->opt);
- err = ip6_xmit(sk, skb, fl6, opt, np->tclass);
+ err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -828,6 +846,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
fl6.fl6_dport = t1->dest;
fl6.fl6_sport = t1->source;
+ fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
/* Pass a socket to ip6_dst_lookup either it is for RST
@@ -837,7 +856,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
if (!IS_ERR(dst)) {
skb_dst_set(buff, dst);
- ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass);
+ ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass);
TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
if (rst)
TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
@@ -954,7 +973,8 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
tcp_rsk(req)->rcv_nxt,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
- tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if,
+ tcp_time_stamp + tcp_rsk(req)->ts_off,
+ req->ts_recent, sk->sk_bound_dev_if,
tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
0, 0);
}
@@ -987,6 +1007,16 @@ drop:
return 0; /* don't send reset */
}
+static void tcp_v6_restore_cb(struct sk_buff *skb)
+{
+ /* We need to move header back to the beginning if xfrm6_policy_check()
+ * and tcp_v6_fill_cb() are going to be called again.
+ * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there.
+ */
+ memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
+ sizeof(struct inet6_skb_parm));
+}
+
static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst,
@@ -1138,10 +1168,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
tcp_ca_openreq_child(newsk, dst);
tcp_sync_mss(newsk, dst_mtu(dst));
- newtp->advmss = dst_metric_advmss(dst);
- if (tcp_sk(sk)->rx_opt.user_mss &&
- tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
- newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
+ newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
tcp_initialize_rcv_mss(newsk);
@@ -1178,8 +1205,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
sk_gfp_mask(sk, GFP_ATOMIC));
consume_skb(ireq->pktopts);
ireq->pktopts = NULL;
- if (newnp->pktoptions)
+ if (newnp->pktoptions) {
+ tcp_v6_restore_cb(newnp->pktoptions);
skb_set_owner_r(newnp->pktoptions, newsk);
+ }
}
}
@@ -1194,16 +1223,6 @@ out:
return NULL;
}
-static void tcp_v6_restore_cb(struct sk_buff *skb)
-{
- /* We need to move header back to the beginning if xfrm6_policy_check()
- * and tcp_v6_fill_cb() are going to be called again.
- * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there.
- */
- memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
- sizeof(struct inet6_skb_parm));
-}
-
/* The socket must have it's spinlock held when we get
* here, unless it is a TCP_LISTEN socket.
*
@@ -1616,7 +1635,6 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
.getsockopt = ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in6),
- .bind_conflict = inet6_csk_bind_conflict,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,
@@ -1647,7 +1665,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
.getsockopt = ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in6),
- .bind_conflict = inet6_csk_bind_conflict,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,
@@ -1740,7 +1757,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
srcp = ntohs(inet->inet_sport);
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
timer_expires = icsk->icsk_timeout;
@@ -1884,6 +1901,7 @@ struct proto tcpv6_prot = {
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
+ .keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
@@ -1944,7 +1962,7 @@ static void __net_exit tcpv6_net_exit(struct net *net)
static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list)
{
- inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET6);
+ inet_twsk_purge(&tcp_hashinfo, AF_INET6);
}
static struct pernet_operations tcpv6_net_ops = {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e4a8000d59ad..e28082f0a307 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -35,7 +35,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/addrconf.h>
#include <net/ndisc.h>
@@ -55,6 +55,16 @@
#include <trace/events/skb.h>
#include "udp_impl.h"
+static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
+{
+#if defined(CONFIG_NET_L3_MASTER_DEV)
+ if (!net->ipv4.sysctl_udp_l3mdev_accept &&
+ skb && ipv6_l3mdev_skb(IP6CB(skb)->flags))
+ return true;
+#endif
+ return false;
+}
+
static u32 udp6_ehashfn(const struct net *net,
const struct in6_addr *laddr,
const u16 lport,
@@ -103,7 +113,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
/* precompute partial secondary hash */
udp_sk(sk)->udp_portaddr_hash = hash2_partial;
- return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr);
+ return udp_lib_get_port(sk, snum, hash2_nulladdr);
}
static void udp_v6_rehash(struct sock *sk)
@@ -118,7 +128,7 @@ static void udp_v6_rehash(struct sock *sk)
static int compute_score(struct sock *sk, struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, unsigned short hnum,
- int dif)
+ int dif, bool exact_dif)
{
int score;
struct inet_sock *inet;
@@ -149,7 +159,7 @@ static int compute_score(struct sock *sk, struct net *net,
score++;
}
- if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if || exact_dif) {
if (sk->sk_bound_dev_if != dif)
return -1;
score++;
@@ -165,7 +175,7 @@ static int compute_score(struct sock *sk, struct net *net,
static struct sock *udp6_lib_lookup2(struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, unsigned int hnum, int dif,
- struct udp_hslot *hslot2,
+ bool exact_dif, struct udp_hslot *hslot2,
struct sk_buff *skb)
{
struct sock *sk, *result;
@@ -176,7 +186,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
badness = -1;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif);
+ daddr, hnum, dif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -212,6 +222,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
unsigned short hnum = ntohs(dport);
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
+ bool exact_dif = udp6_lib_exact_dif_match(net, skb);
int score, badness, matches = 0, reuseport = 0;
u32 hash = 0;
@@ -223,7 +234,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
goto begin;
result = udp6_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif,
+ daddr, hnum, dif, exact_dif,
hslot2, skb);
if (!result) {
unsigned int old_slot2 = slot2;
@@ -239,7 +250,8 @@ struct sock *__udp6_lib_lookup(struct net *net,
result = udp6_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
- hslot2, skb);
+ exact_dif, hslot2,
+ skb);
}
return result;
}
@@ -247,7 +259,8 @@ begin:
result = NULL;
badness = -1;
sk_for_each_rcu(sk, &hslot->head) {
- score = compute_score(sk, net, saddr, sport, daddr, hnum, dif);
+ score = compute_score(sk, net, saddr, sport, daddr, hnum, dif,
+ exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -302,7 +315,8 @@ EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
* Does increment socket refcount.
*/
#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
- IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY)
+ IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \
+ IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport, int dif)
{
@@ -334,7 +348,6 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false;
int is_udp4;
- bool slow;
if (flags & MSG_ERRQUEUE)
return ipv6_recv_error(sk, msg, len, addr_len);
@@ -344,8 +357,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
try_again:
peeking = off = sk_peek_offset(sk, flags);
- skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
- &peeked, &off, &err);
+ skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
if (!skb)
return err;
@@ -364,7 +376,8 @@ try_again:
* coverage checksum (UDP-Lite), do it before the copy.
*/
- if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
+ if (copied < ulen || peeking ||
+ (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
checksum_valid = !udp_lib_checksum_complete(skb);
if (!checksum_valid)
goto csum_copy_err;
@@ -378,7 +391,6 @@ try_again:
goto csum_copy_err;
}
if (unlikely(err)) {
- trace_kfree_skb(skb, udpv6_recvmsg);
if (!peeked) {
atomic_inc(&sk->sk_drops);
if (is_udp4)
@@ -388,7 +400,7 @@ try_again:
UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
is_udplite);
}
- skb_free_datagram_locked(sk, skb);
+ kfree_skb(skb);
return err;
}
if (!peeked) {
@@ -427,7 +439,7 @@ try_again:
if (is_udp4) {
if (inet->cmsg_flags)
- ip_cmsg_recv_offset(msg, skb,
+ ip_cmsg_recv_offset(msg, sk, skb,
sizeof(struct udphdr), off);
} else {
if (np->rxopt.all)
@@ -438,12 +450,11 @@ try_again:
if (flags & MSG_TRUNC)
err = ulen;
- __skb_free_datagram_locked(sk, skb, peeking ? -err : err);
+ skb_consume_udp(sk, skb, peeking ? -err : err);
return err;
csum_copy_err:
- slow = lock_sock_fast(sk);
- if (!skb_kill_datagram(sk, skb, flags)) {
+ if (!__sk_queue_drop_skb(sk, skb, flags, udp_skb_destructor)) {
if (is_udp4) {
UDP_INC_STATS(sock_net(sk),
UDP_MIB_CSUMERRORS, is_udplite);
@@ -456,7 +467,7 @@ csum_copy_err:
UDP_MIB_INERRORS, is_udplite);
}
}
- unlock_sock_fast(sk, slow);
+ kfree_skb(skb);
/* starting over for a new packet, but check if we need to yield */
cond_resched();
@@ -522,9 +533,11 @@ int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
sk_incoming_cpu_update(sk);
+ } else {
+ sk_mark_napi_id_once(sk, skb);
}
- rc = __sock_queue_rcv_skb(sk, skb);
+ rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
@@ -536,6 +549,7 @@ int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
kfree_skb(skb);
return -1;
}
+
return 0;
}
@@ -557,7 +571,6 @@ EXPORT_SYMBOL(udpv6_encap_enable);
int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
- int rc;
int is_udplite = IS_UDPLITE(sk);
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
@@ -623,25 +636,10 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto drop;
udp_csum_pull_header(skb);
- if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
- __UDP6_INC_STATS(sock_net(sk),
- UDP_MIB_RCVBUFERRORS, is_udplite);
- goto drop;
- }
skb_dst_drop(skb);
- bh_lock_sock(sk);
- rc = 0;
- if (!sock_owned_by_user(sk))
- rc = __udpv6_queue_rcv_skb(sk, skb);
- else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
- bh_unlock_sock(sk);
- goto drop;
- }
- bh_unlock_sock(sk);
-
- return rc;
+ return __udpv6_queue_rcv_skb(sk, skb);
csum_error:
__UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
@@ -1037,6 +1035,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc6.hlimit = -1;
ipc6.tclass = -1;
ipc6.dontfrag = -1;
+ sockc.tsflags = sk->sk_tsflags;
/* destination address check */
if (sin6) {
@@ -1048,6 +1047,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
daddr = &sin6->sin6_addr;
+ if (ipv6_addr_any(daddr) &&
+ ipv6_addr_v4mapped(&np->saddr))
+ ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
+ daddr);
break;
case AF_INET:
goto do_udp_sendmsg;
@@ -1156,7 +1159,7 @@ do_udp_sendmsg:
fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
fl6.flowi6_mark = sk->sk_mark;
- sockc.tsflags = sk->sk_tsflags;
+ fl6.flowi6_uid = sk->sk_uid;
if (msg->msg_controllen) {
opt = &opt_space;
@@ -1309,7 +1312,8 @@ out:
return err;
do_confirm:
- dst_confirm(dst);
+ if (msg->msg_flags & MSG_PROBE)
+ dst_confirm_neigh(dst, &fl6.daddr);
if (!(msg->msg_flags&MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
@@ -1434,12 +1438,12 @@ struct proto udpv6_prot = {
.connect = ip6_datagram_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
+ .init = udp_init_sock,
.destroy = udpv6_destroy_sock,
.setsockopt = udpv6_setsockopt,
.getsockopt = udpv6_getsockopt,
.sendmsg = udpv6_sendmsg,
.recvmsg = udpv6_recvmsg,
- .backlog_rcv = __udpv6_queue_rcv_skb,
.release_cb = ip6_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 2f5101a12283..2784cc363f2b 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -45,10 +45,11 @@ struct proto udplitev6_prot = {
.getsockopt = udpv6_getsockopt,
.sendmsg = udpv6_sendmsg,
.recvmsg = udpv6_recvmsg,
- .backlog_rcv = __udpv6_queue_rcv_skb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.get_port = udp_v6_get_port,
+ .memory_allocated = &udp_memory_allocated,
+ .sysctl_mem = sysctl_udp_mem,
.obj_size = sizeof(struct udp6_sock),
.h.udp_table = &udplite_table,
#ifdef CONFIG_COMPAT
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index b5789562aded..08a807b29298 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -33,6 +33,8 @@ EXPORT_SYMBOL(xfrm6_rcv_spi);
int xfrm6_transport_finish(struct sk_buff *skb, int async)
{
+ struct xfrm_offload *xo = xfrm_offload(skb);
+
skb_network_header(skb)[IP6CB(skb)->nhoff] =
XFRM_MODE_SKB_CB(skb)->protocol;
@@ -44,6 +46,11 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
ipv6_hdr(skb)->payload_len = htons(skb->len);
__skb_push(skb, skb->data - skb_network_header(skb));
+ if (xo && (xo->flags & XFRM_GRO)) {
+ skb_mac_header_rebuild(skb);
+ return -1;
+ }
+
NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
dev_net(skb->dev), NULL, skb, skb->dev, NULL,
ip6_rcv_finish);
@@ -69,18 +76,9 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
struct xfrm_state *x = NULL;
int i = 0;
- /* Allocate new secpath or COW existing one. */
- if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
- struct sec_path *sp;
-
- sp = secpath_dup(skb->sp);
- if (!sp) {
- XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
- goto drop;
- }
- if (skb->sp)
- secpath_put(skb->sp);
- skb->sp = sp;
+ if (secpath_set(skb)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
+ goto drop;
}
if (1 + skb->sp->len == XFRM_MAX_DEPTH) {
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
index 4e344105b3fd..4439ee44c8b0 100644
--- a/net/ipv6/xfrm6_mode_transport.c
+++ b/net/ipv6/xfrm6_mode_transport.c
@@ -47,6 +47,7 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb)
static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
{
int ihl = skb->data - skb_transport_header(skb);
+ struct xfrm_offload *xo = xfrm_offload(skb);
if (skb->transport_header != skb->network_header) {
memmove(skb_transport_header(skb),
@@ -55,7 +56,8 @@ static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
}
ipv6_hdr(skb)->payload_len = htons(skb->len + ihl -
sizeof(struct ipv6hdr));
- skb_reset_transport_header(skb);
+ if (!xo || !(xo->flags & XFRM_GRO))
+ skb_reset_transport_header(skb);
return 0;
}
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index e0f71c01d728..79651bc71bf0 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -25,8 +25,6 @@
#include <net/mip6.h>
#endif
-static struct xfrm_policy_afinfo xfrm6_policy_afinfo;
-
static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
const xfrm_address_t *saddr,
const xfrm_address_t *daddr)
@@ -220,7 +218,7 @@ static inline int xfrm6_garbage_collect(struct dst_ops *ops)
{
struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops);
- xfrm6_policy_afinfo.garbage_collect(net);
+ xfrm_garbage_collect_deferred(net);
return dst_entries_get_fast(ops) > ops->gc_thresh * 2;
}
@@ -291,8 +289,7 @@ static struct dst_ops xfrm6_dst_ops_template = {
.gc_thresh = INT_MAX,
};
-static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
- .family = AF_INET6,
+static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
.dst_ops = &xfrm6_dst_ops_template,
.dst_lookup = xfrm6_dst_lookup,
.get_saddr = xfrm6_get_saddr,
@@ -305,7 +302,7 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
static int __init xfrm6_policy_init(void)
{
- return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo);
+ return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo, AF_INET6);
}
static void xfrm6_policy_fini(void)
diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c
index 54d13f8dbbae..b2dc8ce49378 100644
--- a/net/ipv6/xfrm6_protocol.c
+++ b/net/ipv6/xfrm6_protocol.c
@@ -162,9 +162,8 @@ static const struct inet6_protocol ipcomp6_protocol = {
.flags = INET6_PROTO_NOPOLICY,
};
-static struct xfrm_input_afinfo xfrm6_input_afinfo = {
+static const struct xfrm_input_afinfo xfrm6_input_afinfo = {
.family = AF_INET6,
- .owner = THIS_MODULE,
.callback = xfrm6_rcv_cb,
};
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index e1c0bbe7996c..d7b731a78d09 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -44,7 +44,7 @@ struct xfrm6_tunnel_net {
u32 spi;
};
-static int xfrm6_tunnel_net_id __read_mostly;
+static unsigned int xfrm6_tunnel_net_id __read_mostly;
static inline struct xfrm6_tunnel_net *xfrm6_tunnel_pernet(struct net *net)
{
return net_generic(net, xfrm6_tunnel_net_id);
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 48d0dc89b58d..8a9219ff2e77 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -56,7 +56,7 @@
#include <net/tcp_states.h>
#include <net/net_namespace.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/* Configuration Variables */
static unsigned char ipxcfg_max_hops = 16;
@@ -1809,7 +1809,7 @@ static int ipx_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
rc = skb_copy_datagram_msg(skb, sizeof(struct ipxhdr), msg, copied);
if (rc)
goto out_free;
- if (skb->tstamp.tv64)
+ if (skb->tstamp)
sk->sk_stamp = skb->tstamp;
if (sipx) {
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 391c3cbd2eed..8d77ad5cadaf 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -46,13 +46,14 @@
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/slab.h>
+#include <linux/sched/signal.h>
#include <linux/init.h>
#include <linux/net.h>
#include <linux/irda.h>
#include <linux/poll.h>
#include <asm/ioctls.h> /* TIOCOUTQ, TIOCINQ */
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/sock.h>
#include <net/tcp_states.h>
@@ -827,7 +828,8 @@ out:
* Wait for incoming connection
*
*/
-static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
+static int irda_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern)
{
struct sock *sk = sock->sk;
struct irda_sock *new, *self = irda_sk(sk);
@@ -835,7 +837,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
struct sk_buff *skb = NULL;
int err;
- err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0);
+ err = irda_create(sock_net(sk), newsock, sk->sk_protocol, kern);
if (err)
return err;
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index 873c4b707d6a..f6061c4bb0a8 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -32,7 +32,7 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/seq_file.h>
#include <linux/termios.h>
#include <linux/tty.h>
@@ -40,7 +40,7 @@
#include <linux/interrupt.h>
#include <linux/device.h> /* for MODULE_ALIAS_CHARDEV_MAJOR */
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/irda/irda.h>
#include <net/irda/irmod.h>
diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c
index 8f5678cb6263..f18070118d05 100644
--- a/net/irda/ircomm/ircomm_tty_ioctl.c
+++ b/net/irda/ircomm/ircomm_tty_ioctl.c
@@ -32,7 +32,7 @@
#include <linux/tty.h>
#include <linux/serial.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/irda/irda.h>
#include <net/irda/irmod.h>
diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c
index 856736656a30..890b90d055d5 100644
--- a/net/irda/irda_device.c
+++ b/net/irda/irda_device.c
@@ -43,7 +43,7 @@
#include <linux/export.h>
#include <asm/ioctls.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/dma.h>
#include <asm/io.h>
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
index d8b7267280c3..74d09f91709e 100644
--- a/net/irda/irlan/irlan_eth.c
+++ b/net/irda/irlan/irlan_eth.c
@@ -51,7 +51,6 @@ static const struct net_device_ops irlan_eth_netdev_ops = {
.ndo_stop = irlan_eth_close,
.ndo_start_xmit = irlan_eth_xmit,
.ndo_set_rx_mode = irlan_eth_set_multicast_list,
- .ndo_change_mtu = eth_change_mtu,
.ndo_validate_addr = eth_validate_addr,
};
@@ -67,7 +66,8 @@ static void irlan_eth_setup(struct net_device *dev)
dev->netdev_ops = &irlan_eth_netdev_ops;
dev->destructor = free_netdev;
-
+ dev->min_mtu = 0;
+ dev->max_mtu = ETH_MAX_MTU;
/*
* Lets do all queueing in IrTTP instead of this device driver.
diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h
index 8d65bb9477fc..9d451f8ed47a 100644
--- a/net/irda/irnet/irnet.h
+++ b/net/irda/irnet/irnet.h
@@ -245,12 +245,11 @@
#include <linux/tty.h>
#include <linux/proc_fs.h>
#include <linux/netdevice.h>
-#include <linux/miscdevice.h>
#include <linux/poll.h>
#include <linux/capability.h>
#include <linux/ctype.h> /* isspace() */
#include <linux/string.h> /* skip_spaces() */
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/init.h>
#include <linux/ppp_defs.h>
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c
index 1215693fdd22..7025dcb853d0 100644
--- a/net/irda/irnet/irnet_ppp.c
+++ b/net/irda/irnet/irnet_ppp.c
@@ -13,8 +13,9 @@
* 2) as a control channel (write commands, read events)
*/
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/slab.h>
+
#include "irnet_ppp.h" /* Private header */
/* Please put other headers in irnet.h - Thanks */
@@ -51,7 +52,7 @@ irnet_ctrl_write(irnet_socket * ap,
char * next; /* Next command to process */
int length; /* Length of current command */
- DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count);
+ DENTER(CTRL_TRACE, "(ap=0x%p, count=%zd)\n", ap, count);
/* Check for overflow... */
DABORT(count >= IRNET_MAX_COMMAND, -ENOMEM,
@@ -66,7 +67,7 @@ irnet_ctrl_write(irnet_socket * ap,
/* Safe terminate the string */
command[count] = '\0';
- DEBUG(CTRL_INFO, "Command line received is ``%s'' (%Zd).\n",
+ DEBUG(CTRL_INFO, "Command line received is ``%s'' (%zd).\n",
command, count);
/* Check every commands in the command line */
@@ -285,7 +286,7 @@ irnet_ctrl_read(irnet_socket * ap,
char event[75];
ssize_t ret = 0;
- DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count);
+ DENTER(CTRL_TRACE, "(ap=0x%p, count=%zd)\n", ap, count);
#ifdef INITIAL_DISCOVERY
/* Check if we have read the log */
@@ -328,7 +329,7 @@ irnet_ctrl_read(irnet_socket * ap,
if(ret != 0)
{
/* No, return the error code */
- DEXIT(CTRL_TRACE, " - ret %Zd\n", ret);
+ DEXIT(CTRL_TRACE, " - ret %zd\n", ret);
return ret;
}
@@ -568,7 +569,7 @@ dev_irnet_write(struct file * file,
{
irnet_socket * ap = file->private_data;
- DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n",
+ DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%zd)\n",
file, ap, count);
DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n");
@@ -592,7 +593,7 @@ dev_irnet_read(struct file * file,
{
irnet_socket * ap = file->private_data;
- DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n",
+ DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%zd)\n",
file, ap, count);
DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n");
diff --git a/net/irda/irnet/irnet_ppp.h b/net/irda/irnet/irnet_ppp.h
index 940225866da0..32061442cc8e 100644
--- a/net/irda/irnet/irnet_ppp.h
+++ b/net/irda/irnet/irnet_ppp.h
@@ -15,13 +15,10 @@
/***************************** INCLUDES *****************************/
#include "irnet.h" /* Module global include */
+#include <linux/miscdevice.h>
/************************ CONSTANTS & MACROS ************************/
-/* /dev/irnet file constants */
-#define IRNET_MAJOR 10 /* Misc range */
-#define IRNET_MINOR 187 /* Official allocation */
-
/* IrNET control channel stuff */
#define IRNET_MAX_COMMAND 256 /* Max length of a command line */
@@ -111,9 +108,9 @@ static const struct file_operations irnet_device_fops =
/* Structure so that the misc major (drivers/char/misc.c) take care of us... */
static struct miscdevice irnet_misc_device =
{
- IRNET_MINOR,
- "irnet",
- &irnet_device_fops
+ .minor = IRNET_MINOR,
+ .name = "irnet",
+ .fops = &irnet_device_fops
};
#endif /* IRNET_PPP_H */
diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c
index e15c40e86660..7fc340e574cf 100644
--- a/net/irda/irnetlink.c
+++ b/net/irda/irnetlink.c
@@ -24,13 +24,7 @@
-static struct genl_family irda_nl_family = {
- .id = GENL_ID_GENERATE,
- .name = IRDA_NL_NAME,
- .hdrsize = 0,
- .version = IRDA_NL_VERSION,
- .maxattr = IRDA_NL_CMD_MAX,
-};
+static struct genl_family irda_nl_family;
static struct net_device * ifname_to_netdev(struct net *net, struct genl_info *info)
{
@@ -147,9 +141,19 @@ static const struct genl_ops irda_nl_ops[] = {
};
-int irda_nl_register(void)
+static struct genl_family irda_nl_family __ro_after_init = {
+ .name = IRDA_NL_NAME,
+ .hdrsize = 0,
+ .version = IRDA_NL_VERSION,
+ .maxattr = IRDA_NL_CMD_MAX,
+ .module = THIS_MODULE,
+ .ops = irda_nl_ops,
+ .n_ops = ARRAY_SIZE(irda_nl_ops),
+};
+
+int __init irda_nl_register(void)
{
- return genl_register_family_with_ops(&irda_nl_family, irda_nl_ops);
+ return genl_register_family(&irda_nl_family);
}
void irda_nl_unregister(void)
diff --git a/net/irda/irproc.c b/net/irda/irproc.c
index b9ac598e2116..77cfdde9d82f 100644
--- a/net/irda/irproc.c
+++ b/net/irda/irproc.c
@@ -23,7 +23,6 @@
*
********************************************************************/
-#include <linux/miscdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/module.h>
diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c
index acbe61c7e683..160dc89335e2 100644
--- a/net/irda/irqueue.c
+++ b/net/irda/irqueue.c
@@ -383,9 +383,6 @@ EXPORT_SYMBOL(hashbin_new);
* for deallocating this structure if it's complex. If not the user can
* just supply kfree, which should take care of the job.
*/
-#ifdef CONFIG_LOCKDEP
-static int hashbin_lock_depth = 0;
-#endif
int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
{
irda_queue_t* queue;
@@ -396,22 +393,27 @@ int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
IRDA_ASSERT(hashbin->magic == HB_MAGIC, return -1;);
/* Synchronize */
- if ( hashbin->hb_type & HB_LOCK ) {
- spin_lock_irqsave_nested(&hashbin->hb_spinlock, flags,
- hashbin_lock_depth++);
- }
+ if (hashbin->hb_type & HB_LOCK)
+ spin_lock_irqsave(&hashbin->hb_spinlock, flags);
/*
* Free the entries in the hashbin, TODO: use hashbin_clear when
* it has been shown to work
*/
for (i = 0; i < HASHBIN_SIZE; i ++ ) {
- queue = dequeue_first((irda_queue_t**) &hashbin->hb_queue[i]);
- while (queue ) {
- if (free_func)
- (*free_func)(queue);
- queue = dequeue_first(
- (irda_queue_t**) &hashbin->hb_queue[i]);
+ while (1) {
+ queue = dequeue_first((irda_queue_t**) &hashbin->hb_queue[i]);
+
+ if (!queue)
+ break;
+
+ if (free_func) {
+ if (hashbin->hb_type & HB_LOCK)
+ spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
+ free_func(queue);
+ if (hashbin->hb_type & HB_LOCK)
+ spin_lock_irqsave(&hashbin->hb_spinlock, flags);
+ }
}
}
@@ -420,12 +422,8 @@ int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
hashbin->magic = ~HB_MAGIC;
/* Release lock */
- if ( hashbin->hb_type & HB_LOCK) {
+ if (hashbin->hb_type & HB_LOCK)
spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
-#ifdef CONFIG_LOCKDEP
- hashbin_lock_depth--;
-#endif
- }
/*
* Free the hashbin structure
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 02b45a8e8b35..84de7b6326dc 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -17,7 +17,7 @@
#include <linux/list.h>
#include <linux/errno.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/skbuff.h>
#include <linux/init.h>
@@ -453,19 +453,27 @@ static void iucv_sever_path(struct sock *sk, int with_user_data)
}
}
-/* Send FIN through an IUCV socket for HIPER transport */
+/* Send controlling flags through an IUCV socket for HIPER transport */
static int iucv_send_ctrl(struct sock *sk, u8 flags)
{
int err = 0;
int blen;
struct sk_buff *skb;
+ u8 shutdown = 0;
blen = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN;
+ if (sk->sk_shutdown & SEND_SHUTDOWN) {
+ /* controlling flags should be sent anyway */
+ shutdown = sk->sk_shutdown;
+ sk->sk_shutdown &= RCV_SHUTDOWN;
+ }
skb = sock_alloc_send_skb(sk, blen, 1, &err);
if (skb) {
skb_reserve(skb, blen);
err = afiucv_hs_send(NULL, sk, skb, flags);
}
+ if (shutdown)
+ sk->sk_shutdown = shutdown;
return err;
}
@@ -930,7 +938,7 @@ done:
/* Accept a pending connection */
static int iucv_sock_accept(struct socket *sock, struct socket *newsock,
- int flags)
+ int flags, bool kern)
{
DECLARE_WAITQUEUE(wait, current);
struct sock *sk = sock->sk, *nsk;
@@ -1036,7 +1044,8 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
{
struct sock *sk = sock->sk;
struct iucv_sock *iucv = iucv_sk(sk);
- size_t headroom, linear;
+ size_t headroom = 0;
+ size_t linear;
struct sk_buff *skb;
struct iucv_message txmsg = {0};
struct cmsghdr *cmsg;
@@ -1114,18 +1123,20 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
* this is fine for SOCK_SEQPACKET (unless we want to support
* segmented records using the MSG_EOR flag), but
* for SOCK_STREAM we might want to improve it in future */
- headroom = (iucv->transport == AF_IUCV_TRANS_HIPER)
- ? sizeof(struct af_iucv_trans_hdr) + ETH_HLEN : 0;
- if (headroom + len < PAGE_SIZE) {
+ if (iucv->transport == AF_IUCV_TRANS_HIPER) {
+ headroom = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN;
linear = len;
} else {
- /* In nonlinear "classic" iucv skb,
- * reserve space for iucv_array
- */
- if (iucv->transport != AF_IUCV_TRANS_HIPER)
- headroom += sizeof(struct iucv_array) *
- (MAX_SKB_FRAGS + 1);
- linear = PAGE_SIZE - headroom;
+ if (len < PAGE_SIZE) {
+ linear = len;
+ } else {
+ /* In nonlinear "classic" iucv skb,
+ * reserve space for iucv_array
+ */
+ headroom = sizeof(struct iucv_array) *
+ (MAX_SKB_FRAGS + 1);
+ linear = PAGE_SIZE - headroom;
+ }
}
skb = sock_alloc_send_pskb(sk, headroom + linear, len - linear,
noblock, &err, 0);
@@ -1315,8 +1326,13 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb,
}
IUCV_SKB_CB(skb)->offset = 0;
- if (sock_queue_rcv_skb(sk, skb))
- skb_queue_head(&iucv_sk(sk)->backlog_skb_q, skb);
+ if (sk_filter(sk, skb)) {
+ atomic_inc(&sk->sk_drops); /* skb rejected by filter */
+ kfree_skb(skb);
+ return;
+ }
+ if (__sock_queue_rcv_skb(sk, skb)) /* handle rcv queue full */
+ skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, skb);
}
/* iucv_process_message_q() - Process outstanding IUCV messages
@@ -1430,13 +1446,13 @@ static int iucv_sock_recvmsg(struct socket *sock, struct msghdr *msg,
rskb = skb_dequeue(&iucv->backlog_skb_q);
while (rskb) {
IUCV_SKB_CB(rskb)->offset = 0;
- if (sock_queue_rcv_skb(sk, rskb)) {
+ if (__sock_queue_rcv_skb(sk, rskb)) {
+ /* handle rcv queue full */
skb_queue_head(&iucv->backlog_skb_q,
rskb);
break;
- } else {
- rskb = skb_dequeue(&iucv->backlog_skb_q);
}
+ rskb = skb_dequeue(&iucv->backlog_skb_q);
}
if (skb_queue_empty(&iucv->backlog_skb_q)) {
if (!list_empty(&iucv->message_q.list))
@@ -2116,12 +2132,17 @@ static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb)
skb_reset_transport_header(skb);
skb_reset_network_header(skb);
IUCV_SKB_CB(skb)->offset = 0;
+ if (sk_filter(sk, skb)) {
+ atomic_inc(&sk->sk_drops); /* skb rejected by filter */
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+ }
+
spin_lock(&iucv->message_q.lock);
if (skb_queue_empty(&iucv->backlog_skb_q)) {
- if (sock_queue_rcv_skb(sk, skb)) {
+ if (__sock_queue_rcv_skb(sk, skb))
/* handle rcv queue full */
skb_queue_tail(&iucv->backlog_skb_q, skb);
- }
} else
skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, skb);
spin_unlock(&iucv->message_q.lock);
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index 88a2a3ba4212..8f7ef167c45a 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -639,7 +639,7 @@ static void iucv_disable(void)
put_online_cpus();
}
-static void free_iucv_data(int cpu)
+static int iucv_cpu_dead(unsigned int cpu)
{
kfree(iucv_param_irq[cpu]);
iucv_param_irq[cpu] = NULL;
@@ -647,9 +647,10 @@ static void free_iucv_data(int cpu)
iucv_param[cpu] = NULL;
kfree(iucv_irq_data[cpu]);
iucv_irq_data[cpu] = NULL;
+ return 0;
}
-static int alloc_iucv_data(int cpu)
+static int iucv_cpu_prepare(unsigned int cpu)
{
/* Note: GFP_DMA used to get memory below 2G */
iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data),
@@ -671,58 +672,38 @@ static int alloc_iucv_data(int cpu)
return 0;
out_free:
- free_iucv_data(cpu);
+ iucv_cpu_dead(cpu);
return -ENOMEM;
}
-static int iucv_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static int iucv_cpu_online(unsigned int cpu)
{
- cpumask_t cpumask;
- long cpu = (long) hcpu;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- if (alloc_iucv_data(cpu))
- return notifier_from_errno(-ENOMEM);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- free_iucv_data(cpu);
- break;
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- if (!iucv_path_table)
- break;
- smp_call_function_single(cpu, iucv_declare_cpu, NULL, 1);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- if (!iucv_path_table)
- break;
- cpumask_copy(&cpumask, &iucv_buffer_cpumask);
- cpumask_clear_cpu(cpu, &cpumask);
- if (cpumask_empty(&cpumask))
- /* Can't offline last IUCV enabled cpu. */
- return notifier_from_errno(-EINVAL);
- smp_call_function_single(cpu, iucv_retrieve_cpu, NULL, 1);
- if (cpumask_empty(&iucv_irq_cpumask))
- smp_call_function_single(
- cpumask_first(&iucv_buffer_cpumask),
- iucv_allow_cpu, NULL, 1);
- break;
- }
- return NOTIFY_OK;
+ if (!iucv_path_table)
+ return 0;
+ iucv_declare_cpu(NULL);
+ return 0;
}
-static struct notifier_block __refdata iucv_cpu_notifier = {
- .notifier_call = iucv_cpu_notify,
-};
+static int iucv_cpu_down_prep(unsigned int cpu)
+{
+ cpumask_t cpumask;
+
+ if (!iucv_path_table)
+ return 0;
+
+ cpumask_copy(&cpumask, &iucv_buffer_cpumask);
+ cpumask_clear_cpu(cpu, &cpumask);
+ if (cpumask_empty(&cpumask))
+ /* Can't offline last IUCV enabled cpu. */
+ return -EINVAL;
+
+ iucv_retrieve_cpu(NULL);
+ if (!cpumask_empty(&iucv_irq_cpumask))
+ return 0;
+ smp_call_function_single(cpumask_first(&iucv_buffer_cpumask),
+ iucv_allow_cpu, NULL, 1);
+ return 0;
+}
/**
* iucv_sever_pathid
@@ -2027,6 +2008,7 @@ struct iucv_interface iucv_if = {
};
EXPORT_SYMBOL(iucv_if);
+static enum cpuhp_state iucv_online;
/**
* iucv_init
*
@@ -2035,7 +2017,6 @@ EXPORT_SYMBOL(iucv_if);
static int __init iucv_init(void)
{
int rc;
- int cpu;
if (!MACHINE_IS_VM) {
rc = -EPROTONOSUPPORT;
@@ -2054,23 +2035,19 @@ static int __init iucv_init(void)
goto out_int;
}
- cpu_notifier_register_begin();
-
- for_each_online_cpu(cpu) {
- if (alloc_iucv_data(cpu)) {
- rc = -ENOMEM;
- goto out_free;
- }
- }
- rc = __register_hotcpu_notifier(&iucv_cpu_notifier);
+ rc = cpuhp_setup_state(CPUHP_NET_IUCV_PREPARE, "net/iucv:prepare",
+ iucv_cpu_prepare, iucv_cpu_dead);
if (rc)
- goto out_free;
-
- cpu_notifier_register_done();
+ goto out_dev;
+ rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "net/iucv:online",
+ iucv_cpu_online, iucv_cpu_down_prep);
+ if (rc < 0)
+ goto out_prep;
+ iucv_online = rc;
rc = register_reboot_notifier(&iucv_reboot_notifier);
if (rc)
- goto out_cpu;
+ goto out_remove_hp;
ASCEBC(iucv_error_no_listener, 16);
ASCEBC(iucv_error_no_memory, 16);
ASCEBC(iucv_error_pathid, 16);
@@ -2084,15 +2061,11 @@ static int __init iucv_init(void)
out_reboot:
unregister_reboot_notifier(&iucv_reboot_notifier);
-out_cpu:
- cpu_notifier_register_begin();
- __unregister_hotcpu_notifier(&iucv_cpu_notifier);
-out_free:
- for_each_possible_cpu(cpu)
- free_iucv_data(cpu);
-
- cpu_notifier_register_done();
-
+out_remove_hp:
+ cpuhp_remove_state(iucv_online);
+out_prep:
+ cpuhp_remove_state(CPUHP_NET_IUCV_PREPARE);
+out_dev:
root_device_unregister(iucv_root);
out_int:
unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
@@ -2110,7 +2083,6 @@ out:
static void __exit iucv_exit(void)
{
struct iucv_irq_list *p, *n;
- int cpu;
spin_lock_irq(&iucv_queue_lock);
list_for_each_entry_safe(p, n, &iucv_task_queue, list)
@@ -2119,11 +2091,9 @@ static void __exit iucv_exit(void)
kfree(p);
spin_unlock_irq(&iucv_queue_lock);
unregister_reboot_notifier(&iucv_reboot_notifier);
- cpu_notifier_register_begin();
- __unregister_hotcpu_notifier(&iucv_cpu_notifier);
- for_each_possible_cpu(cpu)
- free_iucv_data(cpu);
- cpu_notifier_register_done();
+
+ cpuhp_remove_state_nocalls(iucv_online);
+ cpuhp_remove_state(CPUHP_NET_IUCV_PREPARE);
root_device_unregister(iucv_root);
bus_unregister(&iucv_bus);
unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 7e08a4d3d77d..31762f76cdb5 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -24,6 +24,8 @@
#include <linux/uaccess.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
+#include <linux/sched/signal.h>
+
#include <net/kcm.h>
#include <net/netns/generic.h>
#include <net/sock.h>
@@ -929,23 +931,25 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
goto out_error;
}
- /* New message, alloc head skb */
- head = alloc_skb(0, sk->sk_allocation);
- while (!head) {
- kcm_push(kcm);
- err = sk_stream_wait_memory(sk, &timeo);
- if (err)
- goto out_error;
-
+ if (msg_data_left(msg)) {
+ /* New message, alloc head skb */
head = alloc_skb(0, sk->sk_allocation);
- }
+ while (!head) {
+ kcm_push(kcm);
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err)
+ goto out_error;
+
+ head = alloc_skb(0, sk->sk_allocation);
+ }
- skb = head;
+ skb = head;
- /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
- * csum_and_copy_from_iter from skb_do_copy_data_nocache.
- */
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
+ * csum_and_copy_from_iter from skb_do_copy_data_nocache.
+ */
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
start:
while (msg_data_left(msg)) {
@@ -1018,10 +1022,12 @@ wait_for_memory:
if (eor) {
bool not_busy = skb_queue_empty(&sk->sk_write_queue);
- /* Message complete, queue it on send buffer */
- __skb_queue_tail(&sk->sk_write_queue, head);
- kcm->seq_skb = NULL;
- KCM_STATS_INCR(kcm->stats.tx_msgs);
+ if (head) {
+ /* Message complete, queue it on send buffer */
+ __skb_queue_tail(&sk->sk_write_queue, head);
+ kcm->seq_skb = NULL;
+ KCM_STATS_INCR(kcm->stats.tx_msgs);
+ }
if (msg->msg_flags & MSG_BATCH) {
kcm->tx_wait_more = true;
@@ -1040,8 +1046,10 @@ wait_for_memory:
} else {
/* Message not complete, save state */
partial_message:
- kcm->seq_skb = head;
- kcm_tx_msg(head)->last_skb = skb;
+ if (head) {
+ kcm->seq_skb = head;
+ kcm_tx_msg(head)->last_skb = skb;
+ }
}
KCM_STATS_ADD(kcm->stats.tx_bytes, copied);
@@ -1679,7 +1687,7 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
struct kcm_attach info;
if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
- err = -EFAULT;
+ return -EFAULT;
err = kcm_attach_ioctl(sock, &info);
@@ -1689,7 +1697,7 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
struct kcm_unattach info;
if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
- err = -EFAULT;
+ return -EFAULT;
err = kcm_unattach_ioctl(sock, &info);
@@ -1700,7 +1708,7 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
struct socket *newsock = NULL;
if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
- err = -EFAULT;
+ return -EFAULT;
err = kcm_clone(sock, &info, &newsock);
diff --git a/net/key/af_key.c b/net/key/af_key.c
index f9c9ecb0cdd3..be8cecc65002 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -36,7 +36,7 @@
#define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x))
#define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x))
-static int pfkey_net_id __read_mostly;
+static unsigned int pfkey_net_id __read_mostly;
struct netns_pfkey {
/* List of all pfkey sockets. */
struct hlist_head table;
@@ -63,8 +63,13 @@ struct pfkey_sock {
} u;
struct sk_buff *skb;
} dump;
+ struct mutex dump_lock;
};
+static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len,
+ xfrm_address_t *saddr, xfrm_address_t *daddr,
+ u16 *family);
+
static inline struct pfkey_sock *pfkey_sk(struct sock *sk)
{
return (struct pfkey_sock *)sk;
@@ -139,6 +144,7 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol,
{
struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
struct sock *sk;
+ struct pfkey_sock *pfk;
int err;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -153,6 +159,9 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol,
if (sk == NULL)
goto out;
+ pfk = pfkey_sk(sk);
+ mutex_init(&pfk->dump_lock);
+
sock->ops = &pfkey_ops;
sock_init_data(sock, sk);
@@ -281,13 +290,23 @@ static int pfkey_do_dump(struct pfkey_sock *pfk)
struct sadb_msg *hdr;
int rc;
+ mutex_lock(&pfk->dump_lock);
+ if (!pfk->dump.dump) {
+ rc = 0;
+ goto out;
+ }
+
rc = pfk->dump.dump(pfk);
- if (rc == -ENOBUFS)
- return 0;
+ if (rc == -ENOBUFS) {
+ rc = 0;
+ goto out;
+ }
if (pfk->dump.skb) {
- if (!pfkey_can_dump(&pfk->sk))
- return 0;
+ if (!pfkey_can_dump(&pfk->sk)) {
+ rc = 0;
+ goto out;
+ }
hdr = (struct sadb_msg *) pfk->dump.skb->data;
hdr->sadb_msg_seq = 0;
@@ -298,6 +317,9 @@ static int pfkey_do_dump(struct pfkey_sock *pfk)
}
pfkey_terminate_dump(pfk);
+
+out:
+ mutex_unlock(&pfk->dump_lock);
return rc;
}
@@ -1793,19 +1815,26 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms
struct xfrm_address_filter *filter = NULL;
struct pfkey_sock *pfk = pfkey_sk(sk);
- if (pfk->dump.dump != NULL)
+ mutex_lock(&pfk->dump_lock);
+ if (pfk->dump.dump != NULL) {
+ mutex_unlock(&pfk->dump_lock);
return -EBUSY;
+ }
proto = pfkey_satype2proto(hdr->sadb_msg_satype);
- if (proto == 0)
+ if (proto == 0) {
+ mutex_unlock(&pfk->dump_lock);
return -EINVAL;
+ }
if (ext_hdrs[SADB_X_EXT_FILTER - 1]) {
struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1];
filter = kmalloc(sizeof(*filter), GFP_KERNEL);
- if (filter == NULL)
+ if (filter == NULL) {
+ mutex_unlock(&pfk->dump_lock);
return -ENOMEM;
+ }
memcpy(&filter->saddr, &xfilter->sadb_x_filter_saddr,
sizeof(xfrm_address_t));
@@ -1821,6 +1850,7 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms
pfk->dump.dump = pfkey_dump_sa;
pfk->dump.done = pfkey_dump_sa_done;
xfrm_state_walk_init(&pfk->dump.u.state, proto, filter);
+ mutex_unlock(&pfk->dump_lock);
return pfkey_do_dump(pfk);
}
@@ -1913,19 +1943,14 @@ parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq)
/* addresses present only in tunnel mode */
if (t->mode == XFRM_MODE_TUNNEL) {
- u8 *sa = (u8 *) (rq + 1);
- int family, socklen;
+ int err;
- family = pfkey_sockaddr_extract((struct sockaddr *)sa,
- &t->saddr);
- if (!family)
- return -EINVAL;
-
- socklen = pfkey_sockaddr_len(family);
- if (pfkey_sockaddr_extract((struct sockaddr *)(sa + socklen),
- &t->id.daddr) != family)
- return -EINVAL;
- t->encap_family = family;
+ err = parse_sockaddr_pair(
+ (struct sockaddr *)(rq + 1),
+ rq->sadb_x_ipsecrequest_len - sizeof(*rq),
+ &t->saddr, &t->id.daddr, &t->encap_family);
+ if (err)
+ return err;
} else
t->encap_family = xp->family;
@@ -1945,7 +1970,11 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
if (pol->sadb_x_policy_len * 8 < sizeof(struct sadb_x_policy))
return -EINVAL;
- while (len >= sizeof(struct sadb_x_ipsecrequest)) {
+ while (len >= sizeof(*rq)) {
+ if (len < rq->sadb_x_ipsecrequest_len ||
+ rq->sadb_x_ipsecrequest_len < sizeof(*rq))
+ return -EINVAL;
+
if ((err = parse_ipsecrequest(xp, rq)) < 0)
return err;
len -= rq->sadb_x_ipsecrequest_len;
@@ -2408,7 +2437,6 @@ out:
return err;
}
-#ifdef CONFIG_NET_KEY_MIGRATE
static int pfkey_sockaddr_pair_size(sa_family_t family)
{
return PFKEY_ALIGN8(pfkey_sockaddr_len(family) * 2);
@@ -2420,7 +2448,7 @@ static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len,
{
int af, socklen;
- if (ext_len < pfkey_sockaddr_pair_size(sa->sa_family))
+ if (ext_len < 2 || ext_len < pfkey_sockaddr_pair_size(sa->sa_family))
return -EINVAL;
af = pfkey_sockaddr_extract(sa, saddr);
@@ -2436,6 +2464,7 @@ static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len,
return 0;
}
+#ifdef CONFIG_NET_KEY_MIGRATE
static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len,
struct xfrm_migrate *m)
{
@@ -2443,13 +2472,14 @@ static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len,
struct sadb_x_ipsecrequest *rq2;
int mode;
- if (len <= sizeof(struct sadb_x_ipsecrequest) ||
- len < rq1->sadb_x_ipsecrequest_len)
+ if (len < sizeof(*rq1) ||
+ len < rq1->sadb_x_ipsecrequest_len ||
+ rq1->sadb_x_ipsecrequest_len < sizeof(*rq1))
return -EINVAL;
/* old endoints */
err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1),
- rq1->sadb_x_ipsecrequest_len,
+ rq1->sadb_x_ipsecrequest_len - sizeof(*rq1),
&m->old_saddr, &m->old_daddr,
&m->old_family);
if (err)
@@ -2458,13 +2488,14 @@ static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len,
rq2 = (struct sadb_x_ipsecrequest *)((u8 *)rq1 + rq1->sadb_x_ipsecrequest_len);
len -= rq1->sadb_x_ipsecrequest_len;
- if (len <= sizeof(struct sadb_x_ipsecrequest) ||
- len < rq2->sadb_x_ipsecrequest_len)
+ if (len <= sizeof(*rq2) ||
+ len < rq2->sadb_x_ipsecrequest_len ||
+ rq2->sadb_x_ipsecrequest_len < sizeof(*rq2))
return -EINVAL;
/* new endpoints */
err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1),
- rq2->sadb_x_ipsecrequest_len,
+ rq2->sadb_x_ipsecrequest_len - sizeof(*rq2),
&m->new_saddr, &m->new_daddr,
&m->new_family);
if (err)
@@ -2679,14 +2710,18 @@ static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, const struct sadb
{
struct pfkey_sock *pfk = pfkey_sk(sk);
- if (pfk->dump.dump != NULL)
+ mutex_lock(&pfk->dump_lock);
+ if (pfk->dump.dump != NULL) {
+ mutex_unlock(&pfk->dump_lock);
return -EBUSY;
+ }
pfk->dump.msg_version = hdr->sadb_msg_version;
pfk->dump.msg_portid = hdr->sadb_msg_pid;
pfk->dump.dump = pfkey_dump_sp;
pfk->dump.done = pfkey_dump_sp_done;
xfrm_policy_walk_init(&pfk->dump.u.policy, XFRM_POLICY_TYPE_MAIN);
+ mutex_unlock(&pfk->dump_lock);
return pfkey_do_dump(pfk);
}
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index a2ed3bda4ddc..e37d9554da7b 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -278,7 +278,57 @@ struct l2tp_session *l2tp_session_find(struct net *net, struct l2tp_tunnel *tunn
}
EXPORT_SYMBOL_GPL(l2tp_session_find);
-struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth)
+/* Like l2tp_session_find() but takes a reference on the returned session.
+ * Optionally calls session->ref() too if do_ref is true.
+ */
+struct l2tp_session *l2tp_session_get(struct net *net,
+ struct l2tp_tunnel *tunnel,
+ u32 session_id, bool do_ref)
+{
+ struct hlist_head *session_list;
+ struct l2tp_session *session;
+
+ if (!tunnel) {
+ struct l2tp_net *pn = l2tp_pernet(net);
+
+ session_list = l2tp_session_id_hash_2(pn, session_id);
+
+ rcu_read_lock_bh();
+ hlist_for_each_entry_rcu(session, session_list, global_hlist) {
+ if (session->session_id == session_id) {
+ l2tp_session_inc_refcount(session);
+ if (do_ref && session->ref)
+ session->ref(session);
+ rcu_read_unlock_bh();
+
+ return session;
+ }
+ }
+ rcu_read_unlock_bh();
+
+ return NULL;
+ }
+
+ session_list = l2tp_session_id_hash(tunnel, session_id);
+ read_lock_bh(&tunnel->hlist_lock);
+ hlist_for_each_entry(session, session_list, hlist) {
+ if (session->session_id == session_id) {
+ l2tp_session_inc_refcount(session);
+ if (do_ref && session->ref)
+ session->ref(session);
+ read_unlock_bh(&tunnel->hlist_lock);
+
+ return session;
+ }
+ }
+ read_unlock_bh(&tunnel->hlist_lock);
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(l2tp_session_get);
+
+struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth,
+ bool do_ref)
{
int hash;
struct l2tp_session *session;
@@ -288,6 +338,9 @@ struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth)
for (hash = 0; hash < L2TP_HASH_SIZE; hash++) {
hlist_for_each_entry(session, &tunnel->session_hlist[hash], hlist) {
if (++count > nth) {
+ l2tp_session_inc_refcount(session);
+ if (do_ref && session->ref)
+ session->ref(session);
read_unlock_bh(&tunnel->hlist_lock);
return session;
}
@@ -298,12 +351,13 @@ struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth)
return NULL;
}
-EXPORT_SYMBOL_GPL(l2tp_session_find_nth);
+EXPORT_SYMBOL_GPL(l2tp_session_get_nth);
/* Lookup a session by interface name.
* This is very inefficient but is only used by management interfaces.
*/
-struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname)
+struct l2tp_session *l2tp_session_get_by_ifname(struct net *net, char *ifname,
+ bool do_ref)
{
struct l2tp_net *pn = l2tp_pernet(net);
int hash;
@@ -313,7 +367,11 @@ struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname)
for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) {
hlist_for_each_entry_rcu(session, &pn->l2tp_session_hlist[hash], global_hlist) {
if (!strcmp(session->ifname, ifname)) {
+ l2tp_session_inc_refcount(session);
+ if (do_ref && session->ref)
+ session->ref(session);
rcu_read_unlock_bh();
+
return session;
}
}
@@ -323,7 +381,49 @@ struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname)
return NULL;
}
-EXPORT_SYMBOL_GPL(l2tp_session_find_by_ifname);
+EXPORT_SYMBOL_GPL(l2tp_session_get_by_ifname);
+
+static int l2tp_session_add_to_tunnel(struct l2tp_tunnel *tunnel,
+ struct l2tp_session *session)
+{
+ struct l2tp_session *session_walk;
+ struct hlist_head *g_head;
+ struct hlist_head *head;
+ struct l2tp_net *pn;
+
+ head = l2tp_session_id_hash(tunnel, session->session_id);
+
+ write_lock_bh(&tunnel->hlist_lock);
+ hlist_for_each_entry(session_walk, head, hlist)
+ if (session_walk->session_id == session->session_id)
+ goto exist;
+
+ if (tunnel->version == L2TP_HDR_VER_3) {
+ pn = l2tp_pernet(tunnel->l2tp_net);
+ g_head = l2tp_session_id_hash_2(l2tp_pernet(tunnel->l2tp_net),
+ session->session_id);
+
+ spin_lock_bh(&pn->l2tp_session_hlist_lock);
+ hlist_for_each_entry(session_walk, g_head, global_hlist)
+ if (session_walk->session_id == session->session_id)
+ goto exist_glob;
+
+ hlist_add_head_rcu(&session->global_hlist, g_head);
+ spin_unlock_bh(&pn->l2tp_session_hlist_lock);
+ }
+
+ hlist_add_head(&session->hlist, head);
+ write_unlock_bh(&tunnel->hlist_lock);
+
+ return 0;
+
+exist_glob:
+ spin_unlock_bh(&pn->l2tp_session_hlist_lock);
+exist:
+ write_unlock_bh(&tunnel->hlist_lock);
+
+ return -EEXIST;
+}
/* Lookup a tunnel by id
*/
@@ -633,6 +733,9 @@ discard:
* a data (not control) frame before coming here. Fields up to the
* session-id have already been parsed and ptr points to the data
* after the session-id.
+ *
+ * session->ref() must have been called prior to l2tp_recv_common().
+ * session->deref() will be called automatically after skb is processed.
*/
void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
unsigned char *ptr, unsigned char *optr, u16 hdrflags,
@@ -642,14 +745,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
int offset;
u32 ns, nr;
- /* The ref count is increased since we now hold a pointer to
- * the session. Take care to decrement the refcnt when exiting
- * this function from now on...
- */
- l2tp_session_inc_refcount(session);
- if (session->ref)
- (*session->ref)(session);
-
/* Parse and check optional cookie */
if (session->peer_cookie_len > 0) {
if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) {
@@ -715,7 +810,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
l2tp_info(session, L2TP_MSG_SEQ,
"%s: requested to enable seq numbers by LNS\n",
session->name);
- session->send_seq = -1;
+ session->send_seq = 1;
l2tp_session_set_header_len(session, tunnel->version);
}
} else {
@@ -802,8 +897,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
/* Try to dequeue as many skbs from reorder_q as we can. */
l2tp_recv_dequeue(session);
- l2tp_session_dec_refcount(session);
-
return;
discard:
@@ -812,8 +905,6 @@ discard:
if (session->deref)
(*session->deref)(session);
-
- l2tp_session_dec_refcount(session);
}
EXPORT_SYMBOL(l2tp_recv_common);
@@ -920,8 +1011,14 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
}
/* Find the session context */
- session = l2tp_session_find(tunnel->l2tp_net, tunnel, session_id);
+ session = l2tp_session_get(tunnel->l2tp_net, tunnel, session_id, true);
if (!session || !session->recv_skb) {
+ if (session) {
+ if (session->deref)
+ session->deref(session);
+ l2tp_session_dec_refcount(session);
+ }
+
/* Not found? Pass to userspace to deal with */
l2tp_info(tunnel, L2TP_MSG_DATA,
"%s: no session found (%u/%u). Passing up.\n",
@@ -930,6 +1027,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
}
l2tp_recv_common(session, skb, ptr, optr, hdrflags, length, payload_hook);
+ l2tp_session_dec_refcount(session);
return 0;
@@ -1058,10 +1156,10 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
/* Debug */
if (session->send_seq)
- l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %Zd bytes, ns=%u\n",
+ l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes, ns=%u\n",
session->name, data_len, session->ns - 1);
else
- l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %Zd bytes\n",
+ l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes\n",
session->name, data_len);
if (session->debug & L2TP_MSG_DATA) {
@@ -1317,6 +1415,9 @@ static void l2tp_tunnel_del_work(struct work_struct *work)
struct sock *sk = NULL;
tunnel = container_of(work, struct l2tp_tunnel, del_work);
+
+ l2tp_tunnel_closeall(tunnel);
+
sk = l2tp_tunnel_sock_lookup(tunnel);
if (!sk)
goto out;
@@ -1639,7 +1740,6 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create);
int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
{
l2tp_tunnel_inc_refcount(tunnel);
- l2tp_tunnel_closeall(tunnel);
if (false == queue_work(l2tp_wq, &tunnel->del_work)) {
l2tp_tunnel_dec_refcount(tunnel);
return 1;
@@ -1736,6 +1836,7 @@ EXPORT_SYMBOL_GPL(l2tp_session_set_header_len);
struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg)
{
struct l2tp_session *session;
+ int err;
session = kzalloc(sizeof(struct l2tp_session) + priv_size, GFP_KERNEL);
if (session != NULL) {
@@ -1791,6 +1892,13 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
l2tp_session_set_header_len(session, tunnel->version);
+ err = l2tp_session_add_to_tunnel(tunnel, session);
+ if (err) {
+ kfree(session);
+
+ return ERR_PTR(err);
+ }
+
/* Bump the reference count. The session context is deleted
* only when this drops to zero.
*/
@@ -1800,28 +1908,14 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
/* Ensure tunnel socket isn't deleted */
sock_hold(tunnel->sock);
- /* Add session to the tunnel's hash list */
- write_lock_bh(&tunnel->hlist_lock);
- hlist_add_head(&session->hlist,
- l2tp_session_id_hash(tunnel, session_id));
- write_unlock_bh(&tunnel->hlist_lock);
-
- /* And to the global session list if L2TPv3 */
- if (tunnel->version != L2TP_HDR_VER_2) {
- struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
-
- spin_lock_bh(&pn->l2tp_session_hlist_lock);
- hlist_add_head_rcu(&session->global_hlist,
- l2tp_session_id_hash_2(pn, session_id));
- spin_unlock_bh(&pn->l2tp_session_hlist_lock);
- }
-
/* Ignore management session in session count value */
if (session->session_id != 0)
atomic_inc(&l2tp_session_count);
+
+ return session;
}
- return session;
+ return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(l2tp_session_create);
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 2599af6378e4..8ce7818c7a9d 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -23,16 +23,6 @@
#define L2TP_HASH_BITS_2 8
#define L2TP_HASH_SIZE_2 (1 << L2TP_HASH_BITS_2)
-/* Debug message categories for the DEBUG socket option */
-enum {
- L2TP_MSG_DEBUG = (1 << 0), /* verbose debug (if
- * compiled in) */
- L2TP_MSG_CONTROL = (1 << 1), /* userspace - kernel
- * interface */
- L2TP_MSG_SEQ = (1 << 2), /* sequence numbers */
- L2TP_MSG_DATA = (1 << 3), /* data packets */
-};
-
struct sk_buff;
struct l2tp_stats {
@@ -240,11 +230,16 @@ out:
return tunnel;
}
+struct l2tp_session *l2tp_session_get(struct net *net,
+ struct l2tp_tunnel *tunnel,
+ u32 session_id, bool do_ref);
struct l2tp_session *l2tp_session_find(struct net *net,
struct l2tp_tunnel *tunnel,
u32 session_id);
-struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth);
-struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname);
+struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth,
+ bool do_ref);
+struct l2tp_session *l2tp_session_get_by_ifname(struct net *net, char *ifname,
+ bool do_ref);
struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id);
struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth);
@@ -273,6 +268,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb,
int l2tp_nl_register_ops(enum l2tp_pwtype pw_type,
const struct l2tp_nl_cmd_ops *ops);
void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type);
+int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg);
/* Session reference counts. Incremented when code obtains a reference
* to a session.
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 2d6760a2ae34..d100aed3d06f 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -53,7 +53,7 @@ static void l2tp_dfs_next_tunnel(struct l2tp_dfs_seq_data *pd)
static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd)
{
- pd->session = l2tp_session_find_nth(pd->tunnel, pd->session_idx);
+ pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx, true);
pd->session_idx++;
if (pd->session == NULL) {
@@ -238,10 +238,14 @@ static int l2tp_dfs_seq_show(struct seq_file *m, void *v)
}
/* Show the tunnel or session context */
- if (pd->session == NULL)
+ if (!pd->session) {
l2tp_dfs_seq_tunnel_show(m, pd->tunnel);
- else
+ } else {
l2tp_dfs_seq_session_show(m, pd->session);
+ if (pd->session->deref)
+ pd->session->deref(pd->session);
+ l2tp_session_dec_refcount(pd->session);
+ }
out:
return 0;
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 965f7e344cef..6fd41d7afe1e 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -106,8 +106,8 @@ static int l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
}
-static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *stats)
+static void l2tp_eth_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *stats)
{
struct l2tp_eth *priv = netdev_priv(dev);
@@ -117,10 +117,8 @@ static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev,
stats->rx_bytes = atomic_long_read(&priv->rx_bytes);
stats->rx_packets = atomic_long_read(&priv->rx_packets);
stats->rx_errors = atomic_long_read(&priv->rx_errors);
- return stats;
}
-
static const struct net_device_ops l2tp_eth_netdev_ops = {
.ndo_init = l2tp_eth_dev_init,
.ndo_uninit = l2tp_eth_dev_uninit,
@@ -223,12 +221,6 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p
goto out;
}
- session = l2tp_session_find(net, tunnel, session_id);
- if (session) {
- rc = -EEXIST;
- goto out;
- }
-
if (cfg->ifname) {
dev = dev_get_by_name(net, cfg->ifname);
if (dev) {
@@ -242,8 +234,8 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p
session = l2tp_session_create(sizeof(*spriv), tunnel, session_id,
peer_session_id, cfg);
- if (!session) {
- rc = -ENOMEM;
+ if (IS_ERR(session)) {
+ rc = PTR_ERR(session);
goto out;
}
@@ -259,6 +251,8 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p
session->mtu = dev->mtu - session->hdr_len;
dev->mtu = session->mtu;
dev->needed_headroom += session->hdr_len;
+ dev->min_mtu = 0;
+ dev->max_mtu = ETH_MAX_MTU;
priv = netdev_priv(dev);
priv->dev = dev;
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 8938b6ba57a0..4d322c1b7233 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -11,6 +11,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <asm/ioctls.h>
#include <linux/icmp.h>
#include <linux/module.h>
#include <linux/skbuff.h>
@@ -47,23 +48,32 @@ static inline struct l2tp_ip_sock *l2tp_ip_sk(const struct sock *sk)
return (struct l2tp_ip_sock *)sk;
}
-static struct sock *__l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif, u32 tunnel_id)
+static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr,
+ __be32 raddr, int dif, u32 tunnel_id)
{
struct sock *sk;
sk_for_each_bound(sk, &l2tp_ip_bind_table) {
- struct inet_sock *inet = inet_sk(sk);
- struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk);
+ const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
- if (l2tp == NULL)
+ if (!net_eq(sock_net(sk), net))
continue;
- if ((l2tp->conn_id == tunnel_id) &&
- net_eq(sock_net(sk), net) &&
- !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
- (!sk->sk_bound_dev_if || !dif ||
- sk->sk_bound_dev_if == dif))
- goto found;
+ if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif)
+ continue;
+
+ if (inet->inet_rcv_saddr && laddr &&
+ inet->inet_rcv_saddr != laddr)
+ continue;
+
+ if (inet->inet_daddr && raddr && inet->inet_daddr != raddr)
+ continue;
+
+ if (l2tp->conn_id != tunnel_id)
+ continue;
+
+ goto found;
}
sk = NULL;
@@ -71,15 +81,6 @@ found:
return sk;
}
-static inline struct sock *l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif, u32 tunnel_id)
-{
- struct sock *sk = __l2tp_ip_bind_lookup(net, laddr, dif, tunnel_id);
- if (sk)
- sock_hold(sk);
-
- return sk;
-}
-
/* When processing receive frames, there are two cases to
* consider. Data frames consist of a non-zero session-id and an
* optional cookie. Control frames consist of a regular L2TP header
@@ -142,19 +143,19 @@ static int l2tp_ip_recv(struct sk_buff *skb)
}
/* Ok, this is a data packet. Lookup the session. */
- session = l2tp_session_find(net, NULL, session_id);
- if (session == NULL)
+ session = l2tp_session_get(net, NULL, session_id, true);
+ if (!session)
goto discard;
tunnel = session->tunnel;
- if (tunnel == NULL)
- goto discard;
+ if (!tunnel)
+ goto discard_sess;
/* Trace packet contents, if enabled */
if (tunnel->debug & L2TP_MSG_DATA) {
length = min(32u, skb->len);
if (!pskb_may_pull(skb, length))
- goto discard;
+ goto discard_sess;
/* Point to L2TP header */
optr = ptr = skb->data;
@@ -164,6 +165,7 @@ static int l2tp_ip_recv(struct sk_buff *skb)
}
l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, tunnel->recv_payload_hook);
+ l2tp_session_dec_refcount(session);
return 0;
@@ -177,14 +179,15 @@ pass_up:
tunnel_id = ntohl(*(__be32 *) &skb->data[4]);
tunnel = l2tp_tunnel_find(net, tunnel_id);
- if (tunnel != NULL)
+ if (tunnel) {
sk = tunnel->sock;
- else {
+ sock_hold(sk);
+ } else {
struct iphdr *iph = (struct iphdr *) skb_network_header(skb);
read_lock_bh(&l2tp_ip_lock);
- sk = __l2tp_ip_bind_lookup(net, iph->daddr, inet_iif(skb),
- tunnel_id);
+ sk = __l2tp_ip_bind_lookup(net, iph->daddr, iph->saddr,
+ inet_iif(skb), tunnel_id);
if (!sk) {
read_unlock_bh(&l2tp_ip_lock);
goto discard;
@@ -201,6 +204,12 @@ pass_up:
return sk_receive_skb(sk, skb, 1);
+discard_sess:
+ if (session->deref)
+ session->deref(session);
+ l2tp_session_dec_refcount(session);
+ goto discard;
+
discard_put:
sock_put(sk);
@@ -265,7 +274,7 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (!sock_flag(sk, SOCK_ZAPPED))
goto out;
- if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip))
+ if (sk->sk_state != TCP_CLOSE)
goto out;
chk_addr_ret = inet_addr_type(net, addr->l2tp_addr.s_addr);
@@ -280,7 +289,7 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_saddr = 0; /* Use device */
write_lock_bh(&l2tp_ip_lock);
- if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr,
+ if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, 0,
sk->sk_bound_dev_if, addr->l2tp_conn_id)) {
write_unlock_bh(&l2tp_ip_lock);
ret = -EADDRINUSE;
@@ -387,7 +396,7 @@ static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb)
drop:
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS);
kfree_skb(skb);
- return -1;
+ return 0;
}
/* Userspace will call sendmsg() on the tunnel socket to send L2TP
@@ -560,6 +569,30 @@ out:
return err ? err : copied;
}
+int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ struct sk_buff *skb;
+ int amount;
+
+ switch (cmd) {
+ case SIOCOUTQ:
+ amount = sk_wmem_alloc_get(sk);
+ break;
+ case SIOCINQ:
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ amount = skb ? skb->len : 0;
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ break;
+
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return put_user(amount, (int __user *)arg);
+}
+EXPORT_SYMBOL(l2tp_ioctl);
+
static struct proto l2tp_ip_prot = {
.name = "L2TP/IP",
.owner = THIS_MODULE,
@@ -568,7 +601,7 @@ static struct proto l2tp_ip_prot = {
.bind = l2tp_ip_bind,
.connect = l2tp_ip_connect,
.disconnect = l2tp_ip_disconnect,
- .ioctl = udp_ioctl,
+ .ioctl = l2tp_ioctl,
.destroy = l2tp_ip_destroy_sock,
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index aa821cb639e5..88b397c30d86 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -57,25 +57,36 @@ static inline struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk)
return (struct l2tp_ip6_sock *)sk;
}
-static struct sock *__l2tp_ip6_bind_lookup(struct net *net,
- struct in6_addr *laddr,
+static struct sock *__l2tp_ip6_bind_lookup(const struct net *net,
+ const struct in6_addr *laddr,
+ const struct in6_addr *raddr,
int dif, u32 tunnel_id)
{
struct sock *sk;
sk_for_each_bound(sk, &l2tp_ip6_bind_table) {
- const struct in6_addr *addr = inet6_rcv_saddr(sk);
- struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
+ const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk);
+ const struct in6_addr *sk_raddr = &sk->sk_v6_daddr;
+ const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
- if (l2tp == NULL)
+ if (!net_eq(sock_net(sk), net))
continue;
- if ((l2tp->conn_id == tunnel_id) &&
- net_eq(sock_net(sk), net) &&
- (!addr || ipv6_addr_equal(addr, laddr)) &&
- (!sk->sk_bound_dev_if || !dif ||
- sk->sk_bound_dev_if == dif))
- goto found;
+ if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif)
+ continue;
+
+ if (sk_laddr && !ipv6_addr_any(sk_laddr) &&
+ !ipv6_addr_any(laddr) && !ipv6_addr_equal(sk_laddr, laddr))
+ continue;
+
+ if (!ipv6_addr_any(sk_raddr) && raddr &&
+ !ipv6_addr_any(raddr) && !ipv6_addr_equal(sk_raddr, raddr))
+ continue;
+
+ if (l2tp->conn_id != tunnel_id)
+ continue;
+
+ goto found;
}
sk = NULL;
@@ -83,17 +94,6 @@ found:
return sk;
}
-static inline struct sock *l2tp_ip6_bind_lookup(struct net *net,
- struct in6_addr *laddr,
- int dif, u32 tunnel_id)
-{
- struct sock *sk = __l2tp_ip6_bind_lookup(net, laddr, dif, tunnel_id);
- if (sk)
- sock_hold(sk);
-
- return sk;
-}
-
/* When processing receive frames, there are two cases to
* consider. Data frames consist of a non-zero session-id and an
* optional cookie. Control frames consist of a regular L2TP header
@@ -156,19 +156,19 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
}
/* Ok, this is a data packet. Lookup the session. */
- session = l2tp_session_find(net, NULL, session_id);
- if (session == NULL)
+ session = l2tp_session_get(net, NULL, session_id, true);
+ if (!session)
goto discard;
tunnel = session->tunnel;
- if (tunnel == NULL)
- goto discard;
+ if (!tunnel)
+ goto discard_sess;
/* Trace packet contents, if enabled */
if (tunnel->debug & L2TP_MSG_DATA) {
length = min(32u, skb->len);
if (!pskb_may_pull(skb, length))
- goto discard;
+ goto discard_sess;
/* Point to L2TP header */
optr = ptr = skb->data;
@@ -179,6 +179,8 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
l2tp_recv_common(session, skb, ptr, optr, 0, skb->len,
tunnel->recv_payload_hook);
+ l2tp_session_dec_refcount(session);
+
return 0;
pass_up:
@@ -191,14 +193,15 @@ pass_up:
tunnel_id = ntohl(*(__be32 *) &skb->data[4]);
tunnel = l2tp_tunnel_find(net, tunnel_id);
- if (tunnel != NULL)
+ if (tunnel) {
sk = tunnel->sock;
- else {
+ sock_hold(sk);
+ } else {
struct ipv6hdr *iph = ipv6_hdr(skb);
read_lock_bh(&l2tp_ip6_lock);
- sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, inet6_iif(skb),
- tunnel_id);
+ sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, &iph->saddr,
+ inet6_iif(skb), tunnel_id);
if (!sk) {
read_unlock_bh(&l2tp_ip6_lock);
goto discard;
@@ -215,6 +218,12 @@ pass_up:
return sk_receive_skb(sk, skb, 1);
+discard_sess:
+ if (session->deref)
+ session->deref(session);
+ l2tp_session_dec_refcount(session);
+ goto discard;
+
discard_put:
sock_put(sk);
@@ -330,7 +339,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
rcu_read_unlock();
write_lock_bh(&l2tp_ip6_lock);
- if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, bound_dev_if,
+ if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, NULL, bound_dev_if,
addr->l2tp_conn_id)) {
write_unlock_bh(&l2tp_ip6_lock);
err = -EADDRINUSE;
@@ -525,6 +534,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
ipc6.hlimit = -1;
ipc6.tclass = -1;
@@ -657,7 +667,8 @@ out:
return err < 0 ? err : len;
do_confirm:
- dst_confirm(dst);
+ if (msg->msg_flags & MSG_PROBE)
+ dst_confirm_neigh(dst, &fl6.daddr);
if (!(msg->msg_flags & MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
@@ -729,7 +740,7 @@ static struct proto l2tp_ip6_prot = {
.bind = l2tp_ip6_bind,
.connect = l2tp_ip6_connect,
.disconnect = l2tp_ip6_disconnect,
- .ioctl = udp_ioctl,
+ .ioctl = l2tp_ioctl,
.destroy = l2tp_ip6_destroy_sock,
.setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt,
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index bf3117771822..7e3e669baac4 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -31,14 +31,7 @@
#include "l2tp_core.h"
-static struct genl_family l2tp_nl_family = {
- .id = GENL_ID_GENERATE,
- .name = L2TP_GENL_NAME,
- .version = L2TP_GENL_VERSION,
- .hdrsize = 0,
- .maxattr = L2TP_ATTR_MAX,
- .netnsok = true,
-};
+static struct genl_family l2tp_nl_family;
static const struct genl_multicast_group l2tp_multicast_group[] = {
{
@@ -55,7 +48,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq,
/* Accessed under genl lock */
static const struct l2tp_nl_cmd_ops *l2tp_nl_cmd_ops[__L2TP_PWTYPE_MAX];
-static struct l2tp_session *l2tp_nl_session_find(struct genl_info *info)
+static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info,
+ bool do_ref)
{
u32 tunnel_id;
u32 session_id;
@@ -66,14 +60,15 @@ static struct l2tp_session *l2tp_nl_session_find(struct genl_info *info)
if (info->attrs[L2TP_ATTR_IFNAME]) {
ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]);
- session = l2tp_session_find_by_ifname(net, ifname);
+ session = l2tp_session_get_by_ifname(net, ifname, do_ref);
} else if ((info->attrs[L2TP_ATTR_SESSION_ID]) &&
(info->attrs[L2TP_ATTR_CONN_ID])) {
tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]);
session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]);
tunnel = l2tp_tunnel_find(net, tunnel_id);
if (tunnel)
- session = l2tp_session_find(net, tunnel, session_id);
+ session = l2tp_session_get(net, tunnel, session_id,
+ do_ref);
}
return session;
@@ -227,14 +222,14 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info
cfg.local_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_SPORT]);
if (info->attrs[L2TP_ATTR_UDP_DPORT])
cfg.peer_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_DPORT]);
- if (info->attrs[L2TP_ATTR_UDP_CSUM])
- cfg.use_udp_checksums = nla_get_flag(info->attrs[L2TP_ATTR_UDP_CSUM]);
+ cfg.use_udp_checksums = nla_get_flag(
+ info->attrs[L2TP_ATTR_UDP_CSUM]);
#if IS_ENABLED(CONFIG_IPV6)
- if (info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX])
- cfg.udp6_zero_tx_checksums = nla_get_flag(info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]);
- if (info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX])
- cfg.udp6_zero_rx_checksums = nla_get_flag(info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]);
+ cfg.udp6_zero_tx_checksums = nla_get_flag(
+ info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]);
+ cfg.udp6_zero_rx_checksums = nla_get_flag(
+ info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]);
#endif
}
@@ -386,9 +381,24 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla
switch (tunnel->encap) {
case L2TP_ENCAPTYPE_UDP:
+ switch (sk->sk_family) {
+ case AF_INET:
+ if (nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx))
+ goto nla_put_failure;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ if (udp_get_no_check6_tx(sk) &&
+ nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_TX))
+ goto nla_put_failure;
+ if (udp_get_no_check6_rx(sk) &&
+ nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_RX))
+ goto nla_put_failure;
+ break;
+#endif
+ }
if (nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) ||
- nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport)) ||
- nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx))
+ nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport)))
goto nla_put_failure;
/* NOBREAK */
case L2TP_ENCAPTYPE_IP:
@@ -634,10 +644,12 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
session_id, peer_session_id, &cfg);
if (ret >= 0) {
- session = l2tp_session_find(net, tunnel, session_id);
- if (session)
+ session = l2tp_session_get(net, tunnel, session_id, false);
+ if (session) {
ret = l2tp_session_notify(&l2tp_nl_family, info, session,
L2TP_CMD_SESSION_CREATE);
+ l2tp_session_dec_refcount(session);
+ }
}
out:
@@ -650,7 +662,7 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf
struct l2tp_session *session;
u16 pw_type;
- session = l2tp_nl_session_find(info);
+ session = l2tp_nl_session_get(info, true);
if (session == NULL) {
ret = -ENODEV;
goto out;
@@ -664,6 +676,10 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf
if (l2tp_nl_cmd_ops[pw_type] && l2tp_nl_cmd_ops[pw_type]->session_delete)
ret = (*l2tp_nl_cmd_ops[pw_type]->session_delete)(session);
+ if (session->deref)
+ session->deref(session);
+ l2tp_session_dec_refcount(session);
+
out:
return ret;
}
@@ -673,7 +689,7 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
int ret = 0;
struct l2tp_session *session;
- session = l2tp_nl_session_find(info);
+ session = l2tp_nl_session_get(info, false);
if (session == NULL) {
ret = -ENODEV;
goto out;
@@ -708,6 +724,8 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
ret = l2tp_session_notify(&l2tp_nl_family, info,
session, L2TP_CMD_SESSION_MODIFY);
+ l2tp_session_dec_refcount(session);
+
out:
return ret;
}
@@ -803,29 +821,34 @@ static int l2tp_nl_cmd_session_get(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *msg;
int ret;
- session = l2tp_nl_session_find(info);
+ session = l2tp_nl_session_get(info, false);
if (session == NULL) {
ret = -ENODEV;
- goto out;
+ goto err;
}
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg) {
ret = -ENOMEM;
- goto out;
+ goto err_ref;
}
ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq,
0, session, L2TP_CMD_SESSION_GET);
if (ret < 0)
- goto err_out;
+ goto err_ref_msg;
- return genlmsg_unicast(genl_info_net(info), msg, info->snd_portid);
+ ret = genlmsg_unicast(genl_info_net(info), msg, info->snd_portid);
-err_out:
- nlmsg_free(msg);
+ l2tp_session_dec_refcount(session);
-out:
+ return ret;
+
+err_ref_msg:
+ nlmsg_free(msg);
+err_ref:
+ l2tp_session_dec_refcount(session);
+err:
return ret;
}
@@ -844,7 +867,7 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
goto out;
}
- session = l2tp_session_find_nth(tunnel, si);
+ session = l2tp_session_get_nth(tunnel, si, false);
if (session == NULL) {
ti++;
tunnel = NULL;
@@ -854,8 +877,11 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
if (l2tp_nl_session_send(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- session, L2TP_CMD_SESSION_GET) < 0)
+ session, L2TP_CMD_SESSION_GET) < 0) {
+ l2tp_session_dec_refcount(session);
break;
+ }
+ l2tp_session_dec_refcount(session);
si++;
}
@@ -977,6 +1003,19 @@ static const struct genl_ops l2tp_nl_ops[] = {
},
};
+static struct genl_family l2tp_nl_family __ro_after_init = {
+ .name = L2TP_GENL_NAME,
+ .version = L2TP_GENL_VERSION,
+ .hdrsize = 0,
+ .maxattr = L2TP_ATTR_MAX,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = l2tp_nl_ops,
+ .n_ops = ARRAY_SIZE(l2tp_nl_ops),
+ .mcgrps = l2tp_multicast_group,
+ .n_mcgrps = ARRAY_SIZE(l2tp_multicast_group),
+};
+
int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops)
{
int ret;
@@ -1010,12 +1049,10 @@ void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type)
}
EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops);
-static int l2tp_nl_init(void)
+static int __init l2tp_nl_init(void)
{
pr_info("L2TP netlink interface\n");
- return genl_register_family_with_ops_groups(&l2tp_nl_family,
- l2tp_nl_ops,
- l2tp_multicast_group);
+ return genl_register_family(&l2tp_nl_family);
}
static void l2tp_nl_cleanup(void)
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 41d47bfda15c..32ea0f3d868c 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -231,14 +231,14 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
if (sk->sk_state & PPPOX_BOUND) {
struct pppox_sock *po;
- l2tp_dbg(session, PPPOL2TP_MSG_DATA,
+ l2tp_dbg(session, L2TP_MSG_DATA,
"%s: recv %d byte data frame, passing to ppp\n",
session->name, data_len);
po = pppox_sk(sk);
ppp_input(&po->chan, skb);
} else {
- l2tp_dbg(session, PPPOL2TP_MSG_DATA,
+ l2tp_dbg(session, L2TP_MSG_DATA,
"%s: recv %d byte data frame, passing to L2TP socket\n",
session->name, data_len);
@@ -251,7 +251,7 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
return;
no_sock:
- l2tp_info(session, PPPOL2TP_MSG_DATA, "%s: no socket\n", session->name);
+ l2tp_info(session, L2TP_MSG_DATA, "%s: no socket\n", session->name);
kfree_skb(skb);
}
@@ -450,6 +450,10 @@ static void pppol2tp_session_close(struct l2tp_session *session)
static void pppol2tp_session_destruct(struct sock *sk)
{
struct l2tp_session *session = sk->sk_user_data;
+
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge(&sk->sk_write_queue);
+
if (session) {
sk->sk_user_data = NULL;
BUG_ON(session->magic != L2TP_SESSION_MAGIC);
@@ -488,9 +492,6 @@ static int pppol2tp_release(struct socket *sock)
l2tp_session_queue_purge(session);
sock_put(sk);
}
- skb_queue_purge(&sk->sk_receive_queue);
- skb_queue_purge(&sk->sk_write_queue);
-
release_sock(sk);
/* This will delete the session context via
@@ -582,6 +583,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
int error = 0;
u32 tunnel_id, peer_tunnel_id;
u32 session_id, peer_session_id;
+ bool drop_refcnt = false;
int ver = 2;
int fd;
@@ -683,36 +685,36 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
if (tunnel->peer_tunnel_id == 0)
tunnel->peer_tunnel_id = peer_tunnel_id;
- /* Create session if it doesn't already exist. We handle the
- * case where a session was previously created by the netlink
- * interface by checking that the session doesn't already have
- * a socket and its tunnel socket are what we expect. If any
- * of those checks fail, return EEXIST to the caller.
- */
- session = l2tp_session_find(sock_net(sk), tunnel, session_id);
- if (session == NULL) {
- /* Default MTU must allow space for UDP/L2TP/PPP
- * headers.
+ session = l2tp_session_get(sock_net(sk), tunnel, session_id, false);
+ if (session) {
+ drop_refcnt = true;
+ ps = l2tp_session_priv(session);
+
+ /* Using a pre-existing session is fine as long as it hasn't
+ * been connected yet.
*/
- cfg.mtu = cfg.mru = 1500 - PPPOL2TP_HEADER_OVERHEAD;
+ if (ps->sock) {
+ error = -EEXIST;
+ goto end;
+ }
- /* Allocate and initialize a new session context. */
- session = l2tp_session_create(sizeof(struct pppol2tp_session),
- tunnel, session_id,
- peer_session_id, &cfg);
- if (session == NULL) {
- error = -ENOMEM;
+ /* consistency checks */
+ if (ps->tunnel_sock != tunnel->sock) {
+ error = -EEXIST;
goto end;
}
} else {
- ps = l2tp_session_priv(session);
- error = -EEXIST;
- if (ps->sock != NULL)
- goto end;
+ /* Default MTU must allow space for UDP/L2TP/PPP headers */
+ cfg.mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
+ cfg.mru = cfg.mtu;
- /* consistency checks */
- if (ps->tunnel_sock != tunnel->sock)
+ session = l2tp_session_create(sizeof(struct pppol2tp_session),
+ tunnel, session_id,
+ peer_session_id, &cfg);
+ if (IS_ERR(session)) {
+ error = PTR_ERR(session);
goto end;
+ }
}
/* Associate session with its PPPoL2TP socket */
@@ -773,10 +775,12 @@ out_no_ppp:
/* This is how we get the session context from the socket. */
sk->sk_user_data = session;
sk->sk_state = PPPOX_CONNECTED;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: created\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: created\n",
session->name);
end:
+ if (drop_refcnt)
+ l2tp_session_dec_refcount(session);
release_sock(sk);
return error;
@@ -804,12 +808,6 @@ static int pppol2tp_session_create(struct net *net, u32 tunnel_id, u32 session_i
if (tunnel->sock == NULL)
goto out;
- /* Check that this session doesn't already exist */
- error = -EEXIST;
- session = l2tp_session_find(net, tunnel, session_id);
- if (session != NULL)
- goto out;
-
/* Default MTU values. */
if (cfg->mtu == 0)
cfg->mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
@@ -817,17 +815,18 @@ static int pppol2tp_session_create(struct net *net, u32 tunnel_id, u32 session_i
cfg->mru = cfg->mtu;
/* Allocate and initialize a new session context. */
- error = -ENOMEM;
session = l2tp_session_create(sizeof(struct pppol2tp_session),
tunnel, session_id,
peer_session_id, cfg);
- if (session == NULL)
+ if (IS_ERR(session)) {
+ error = PTR_ERR(session);
goto out;
+ }
ps = l2tp_session_priv(session);
ps->tunnel_sock = tunnel->sock;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: created\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: created\n",
session->name);
error = 0;
@@ -989,7 +988,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
struct l2tp_tunnel *tunnel = session->tunnel;
struct pppol2tp_ioc_stats stats;
- l2tp_dbg(session, PPPOL2TP_MSG_CONTROL,
+ l2tp_dbg(session, L2TP_MSG_CONTROL,
"%s: pppol2tp_session_ioctl(cmd=%#x, arg=%#lx)\n",
session->name, cmd, arg);
@@ -1009,7 +1008,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
if (copy_to_user((void __user *) arg, &ifr, sizeof(struct ifreq)))
break;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get mtu=%d\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: get mtu=%d\n",
session->name, session->mtu);
err = 0;
break;
@@ -1025,7 +1024,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
session->mtu = ifr.ifr_mtu;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set mtu=%d\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: set mtu=%d\n",
session->name, session->mtu);
err = 0;
break;
@@ -1039,7 +1038,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
if (put_user(session->mru, (int __user *) arg))
break;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get mru=%d\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: get mru=%d\n",
session->name, session->mru);
err = 0;
break;
@@ -1054,7 +1053,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
break;
session->mru = val;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set mru=%d\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: set mru=%d\n",
session->name, session->mru);
err = 0;
break;
@@ -1064,7 +1063,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
if (put_user(ps->flags, (int __user *) arg))
break;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get flags=%d\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: get flags=%d\n",
session->name, ps->flags);
err = 0;
break;
@@ -1074,7 +1073,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
if (get_user(val, (int __user *) arg))
break;
ps->flags = val;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set flags=%d\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: set flags=%d\n",
session->name, ps->flags);
err = 0;
break;
@@ -1091,7 +1090,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
if (copy_to_user((void __user *) arg, &stats,
sizeof(stats)))
break;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get L2TP stats\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: get L2TP stats\n",
session->name);
err = 0;
break;
@@ -1119,7 +1118,7 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel,
struct sock *sk;
struct pppol2tp_ioc_stats stats;
- l2tp_dbg(tunnel, PPPOL2TP_MSG_CONTROL,
+ l2tp_dbg(tunnel, L2TP_MSG_CONTROL,
"%s: pppol2tp_tunnel_ioctl(cmd=%#x, arg=%#lx)\n",
tunnel->name, cmd, arg);
@@ -1140,11 +1139,18 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel,
if (stats.session_id != 0) {
/* resend to session ioctl handler */
struct l2tp_session *session =
- l2tp_session_find(sock_net(sk), tunnel, stats.session_id);
- if (session != NULL)
- err = pppol2tp_session_ioctl(session, cmd, arg);
- else
+ l2tp_session_get(sock_net(sk), tunnel,
+ stats.session_id, true);
+
+ if (session) {
+ err = pppol2tp_session_ioctl(session, cmd,
+ arg);
+ if (session->deref)
+ session->deref(session);
+ l2tp_session_dec_refcount(session);
+ } else {
err = -EBADR;
+ }
break;
}
#ifdef CONFIG_XFRM
@@ -1155,7 +1161,7 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel,
err = -EFAULT;
break;
}
- l2tp_info(tunnel, PPPOL2TP_MSG_CONTROL, "%s: get L2TP stats\n",
+ l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: get L2TP stats\n",
tunnel->name);
err = 0;
break;
@@ -1245,7 +1251,7 @@ static int pppol2tp_tunnel_setsockopt(struct sock *sk,
switch (optname) {
case PPPOL2TP_SO_DEBUG:
tunnel->debug = val;
- l2tp_info(tunnel, PPPOL2TP_MSG_CONTROL, "%s: set debug=%x\n",
+ l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: set debug=%x\n",
tunnel->name, tunnel->debug);
break;
@@ -1272,8 +1278,8 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
err = -EINVAL;
break;
}
- session->recv_seq = val ? -1 : 0;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ session->recv_seq = !!val;
+ l2tp_info(session, L2TP_MSG_CONTROL,
"%s: set recv_seq=%d\n",
session->name, session->recv_seq);
break;
@@ -1283,7 +1289,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
err = -EINVAL;
break;
}
- session->send_seq = val ? -1 : 0;
+ session->send_seq = !!val;
{
struct sock *ssk = ps->sock;
struct pppox_sock *po = pppox_sk(ssk);
@@ -1291,7 +1297,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
PPPOL2TP_L2TP_HDR_SIZE_NOSEQ;
}
l2tp_session_set_header_len(session, session->tunnel->version);
- l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ l2tp_info(session, L2TP_MSG_CONTROL,
"%s: set send_seq=%d\n",
session->name, session->send_seq);
break;
@@ -1301,21 +1307,21 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
err = -EINVAL;
break;
}
- session->lns_mode = val ? -1 : 0;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ session->lns_mode = !!val;
+ l2tp_info(session, L2TP_MSG_CONTROL,
"%s: set lns_mode=%d\n",
session->name, session->lns_mode);
break;
case PPPOL2TP_SO_DEBUG:
session->debug = val;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set debug=%x\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: set debug=%x\n",
session->name, session->debug);
break;
case PPPOL2TP_SO_REORDERTO:
session->reorder_timeout = msecs_to_jiffies(val);
- l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ l2tp_info(session, L2TP_MSG_CONTROL,
"%s: set reorder_timeout=%d\n",
session->name, session->reorder_timeout);
break;
@@ -1377,8 +1383,6 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname,
} else
err = pppol2tp_session_setsockopt(sk, session, optname, val);
- err = 0;
-
end_put_sess:
sock_put(sk);
end:
@@ -1396,7 +1400,7 @@ static int pppol2tp_tunnel_getsockopt(struct sock *sk,
switch (optname) {
case PPPOL2TP_SO_DEBUG:
*val = tunnel->debug;
- l2tp_info(tunnel, PPPOL2TP_MSG_CONTROL, "%s: get debug=%x\n",
+ l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: get debug=%x\n",
tunnel->name, tunnel->debug);
break;
@@ -1419,31 +1423,31 @@ static int pppol2tp_session_getsockopt(struct sock *sk,
switch (optname) {
case PPPOL2TP_SO_RECVSEQ:
*val = session->recv_seq;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ l2tp_info(session, L2TP_MSG_CONTROL,
"%s: get recv_seq=%d\n", session->name, *val);
break;
case PPPOL2TP_SO_SENDSEQ:
*val = session->send_seq;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ l2tp_info(session, L2TP_MSG_CONTROL,
"%s: get send_seq=%d\n", session->name, *val);
break;
case PPPOL2TP_SO_LNSMODE:
*val = session->lns_mode;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ l2tp_info(session, L2TP_MSG_CONTROL,
"%s: get lns_mode=%d\n", session->name, *val);
break;
case PPPOL2TP_SO_DEBUG:
*val = session->debug;
- l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get debug=%d\n",
+ l2tp_info(session, L2TP_MSG_CONTROL, "%s: get debug=%d\n",
session->name, *val);
break;
case PPPOL2TP_SO_REORDERTO:
*val = (int) jiffies_to_msecs(session->reorder_timeout);
- l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ l2tp_info(session, L2TP_MSG_CONTROL,
"%s: get reorder_timeout=%d\n", session->name, *val);
break;
@@ -1501,8 +1505,13 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname,
err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val);
sock_put(ps->tunnel_sock);
- } else
+ if (err)
+ goto end_put_sess;
+ } else {
err = pppol2tp_session_getsockopt(sk, session, optname, &val);
+ if (err)
+ goto end_put_sess;
+ }
err = -EFAULT;
if (put_user(len, optlen))
@@ -1554,7 +1563,7 @@ static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd)
static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd)
{
- pd->session = l2tp_session_find_nth(pd->tunnel, pd->session_idx);
+ pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx, true);
pd->session_idx++;
if (pd->session == NULL) {
@@ -1681,10 +1690,14 @@ static int pppol2tp_seq_show(struct seq_file *m, void *v)
/* Show the tunnel or session context.
*/
- if (pd->session == NULL)
+ if (!pd->session) {
pppol2tp_seq_tunnel_show(m, pd->tunnel);
- else
+ } else {
pppol2tp_seq_session_show(m, pd->session);
+ if (pd->session->deref)
+ pd->session->deref(pd->session);
+ l2tp_session_dec_refcount(pd->session);
+ }
out:
return 0;
@@ -1843,4 +1856,4 @@ MODULE_DESCRIPTION("PPP over L2TP over UDP");
MODULE_LICENSE("GPL");
MODULE_VERSION(PPPOL2TP_DRV_VERSION);
MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_OL2TP);
-MODULE_ALIAS_L2TP_PWTYPE(11);
+MODULE_ALIAS_L2TP_PWTYPE(7);
diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c
index fc60d9d738b5..b50b64ac8815 100644
--- a/net/lapb/lapb_iface.c
+++ b/net/lapb/lapb_iface.c
@@ -33,7 +33,7 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/lapb/lapb_in.c b/net/lapb/lapb_in.c
index 182470847fcf..d5d2110eb717 100644
--- a/net/lapb/lapb_in.c
+++ b/net/lapb/lapb_in.c
@@ -31,7 +31,7 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/lapb/lapb_out.c b/net/lapb/lapb_out.c
index 482c94d9d958..eda726e22f64 100644
--- a/net/lapb/lapb_out.c
+++ b/net/lapb/lapb_out.c
@@ -29,7 +29,7 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c
index 3c1914df641f..75efde3e616c 100644
--- a/net/lapb/lapb_subr.c
+++ b/net/lapb/lapb_subr.c
@@ -28,7 +28,7 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c
index 355cc3b6fa4d..1a5535bc3b8d 100644
--- a/net/lapb/lapb_timer.c
+++ b/net/lapb/lapb_timer.c
@@ -29,7 +29,7 @@
#include <linux/inet.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index db916cf51ffe..cb4fff785cbf 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -26,6 +26,8 @@
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/sched/signal.h>
+
#include <net/llc.h>
#include <net/llc_sap.h>
#include <net/llc_pdu.h>
@@ -532,12 +534,12 @@ out:
static int llc_ui_wait_for_disc(struct sock *sk, long timeout)
{
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
int rc = 0;
+ add_wait_queue(sk_sleep(sk), &wait);
while (1) {
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- if (sk_wait_event(sk, &timeout, sk->sk_state == TCP_CLOSE))
+ if (sk_wait_event(sk, &timeout, sk->sk_state == TCP_CLOSE, &wait))
break;
rc = -ERESTARTSYS;
if (signal_pending(current))
@@ -547,39 +549,39 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout)
break;
rc = 0;
}
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
return rc;
}
static bool llc_ui_wait_for_conn(struct sock *sk, long timeout)
{
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ add_wait_queue(sk_sleep(sk), &wait);
while (1) {
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- if (sk_wait_event(sk, &timeout, sk->sk_state != TCP_SYN_SENT))
+ if (sk_wait_event(sk, &timeout, sk->sk_state != TCP_SYN_SENT, &wait))
break;
if (signal_pending(current) || !timeout)
break;
}
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
return timeout;
}
static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout)
{
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct llc_sock *llc = llc_sk(sk);
int rc;
+ add_wait_queue(sk_sleep(sk), &wait);
while (1) {
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
rc = 0;
if (sk_wait_event(sk, &timeout,
(sk->sk_shutdown & RCV_SHUTDOWN) ||
(!llc_data_accept_state(llc->state) &&
!llc->remote_busy_flag &&
- !llc->p_flag)))
+ !llc->p_flag), &wait))
break;
rc = -ERESTARTSYS;
if (signal_pending(current))
@@ -588,7 +590,7 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout)
if (!timeout)
break;
}
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
return rc;
}
@@ -639,11 +641,13 @@ static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb)
* @sock: Socket which connections arrive on.
* @newsock: Socket to move incoming connection to.
* @flags: User specified operational flags.
+ * @kern: If the socket is kernel internal
*
* Accept a new incoming connection.
* Returns 0 upon success, negative otherwise.
*/
-static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags)
+static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern)
{
struct sock *sk = sock->sk, *newsk;
struct llc_sock *llc, *newllc;
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 3e821daf9dd4..8bc5a1bd2d45 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -821,7 +821,10 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb)
* another trick required to cope with how the PROCOM state
* machine works. -acme
*/
+ skb_orphan(skb);
+ sock_hold(sk);
skb->sk = sk;
+ skb->destructor = sock_efree;
}
if (!sock_owned_by_user(sk))
llc_conn_rcv(sk, skb);
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index d0e1e804ebd7..5404d0d195cc 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -290,7 +290,10 @@ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb,
ev->type = LLC_SAP_EV_TYPE_PDU;
ev->reason = 0;
+ skb_orphan(skb);
+ sock_hold(sk);
skb->sk = sk;
+ skb->destructor = sock_efree;
llc_sap_state_process(sap, skb);
}
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index 3891cbd2adea..76e30f4797fb 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -6,6 +6,7 @@ config MAC80211
select CRYPTO_AES
select CRYPTO_CCM
select CRYPTO_GCM
+ select CRYPTO_CMAC
select CRC32
---help---
This option enables the hardware independent IEEE 802.11
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index f9137a8341f4..282912245938 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -19,6 +19,7 @@ mac80211-y := \
aes_gcm.o \
aes_cmac.o \
aes_gmac.o \
+ fils_aead.o \
cfg.o \
ethtool.o \
rx.o \
@@ -60,4 +61,4 @@ rc80211_minstrel_ht-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_ht_debugfs.o
mac80211-$(CONFIG_MAC80211_RC_MINSTREL) += $(rc80211_minstrel-y)
mac80211-$(CONFIG_MAC80211_RC_MINSTREL_HT) += $(rc80211_minstrel_ht-y)
-ccflags-y += -D__CHECK_ENDIAN__ -DDEBUG
+ccflags-y += -DDEBUG
diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c
index bdf0790d89cc..2fb65588490c 100644
--- a/net/mac80211/aes_cmac.c
+++ b/net/mac80211/aes_cmac.c
@@ -22,126 +22,50 @@
#define CMAC_TLEN_256 16 /* CMAC TLen = 128 bits (16 octets) */
#define AAD_LEN 20
+static const u8 zero[CMAC_TLEN_256];
-static void gf_mulx(u8 *pad)
-{
- int i, carry;
-
- carry = pad[0] & 0x80;
- for (i = 0; i < AES_BLOCK_SIZE - 1; i++)
- pad[i] = (pad[i] << 1) | (pad[i + 1] >> 7);
- pad[AES_BLOCK_SIZE - 1] <<= 1;
- if (carry)
- pad[AES_BLOCK_SIZE - 1] ^= 0x87;
-}
-
-static void aes_cmac_vector(struct crypto_cipher *tfm, size_t num_elem,
- const u8 *addr[], const size_t *len, u8 *mac,
- size_t mac_len)
-{
- u8 cbc[AES_BLOCK_SIZE], pad[AES_BLOCK_SIZE];
- const u8 *pos, *end;
- size_t i, e, left, total_len;
-
- memset(cbc, 0, AES_BLOCK_SIZE);
-
- total_len = 0;
- for (e = 0; e < num_elem; e++)
- total_len += len[e];
- left = total_len;
-
- e = 0;
- pos = addr[0];
- end = pos + len[0];
-
- while (left >= AES_BLOCK_SIZE) {
- for (i = 0; i < AES_BLOCK_SIZE; i++) {
- cbc[i] ^= *pos++;
- if (pos >= end) {
- e++;
- pos = addr[e];
- end = pos + len[e];
- }
- }
- if (left > AES_BLOCK_SIZE)
- crypto_cipher_encrypt_one(tfm, cbc, cbc);
- left -= AES_BLOCK_SIZE;
- }
-
- memset(pad, 0, AES_BLOCK_SIZE);
- crypto_cipher_encrypt_one(tfm, pad, pad);
- gf_mulx(pad);
-
- if (left || total_len == 0) {
- for (i = 0; i < left; i++) {
- cbc[i] ^= *pos++;
- if (pos >= end) {
- e++;
- pos = addr[e];
- end = pos + len[e];
- }
- }
- cbc[left] ^= 0x80;
- gf_mulx(pad);
- }
-
- for (i = 0; i < AES_BLOCK_SIZE; i++)
- pad[i] ^= cbc[i];
- crypto_cipher_encrypt_one(tfm, pad, pad);
- memcpy(mac, pad, mac_len);
-}
-
-
-void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad,
+void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad,
const u8 *data, size_t data_len, u8 *mic)
{
- const u8 *addr[3];
- size_t len[3];
- u8 zero[CMAC_TLEN];
+ SHASH_DESC_ON_STACK(desc, tfm);
+ u8 out[AES_BLOCK_SIZE];
- memset(zero, 0, CMAC_TLEN);
- addr[0] = aad;
- len[0] = AAD_LEN;
- addr[1] = data;
- len[1] = data_len - CMAC_TLEN;
- addr[2] = zero;
- len[2] = CMAC_TLEN;
+ desc->tfm = tfm;
- aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN);
+ crypto_shash_init(desc);
+ crypto_shash_update(desc, aad, AAD_LEN);
+ crypto_shash_update(desc, data, data_len - CMAC_TLEN);
+ crypto_shash_finup(desc, zero, CMAC_TLEN, out);
+
+ memcpy(mic, out, CMAC_TLEN);
}
-void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad,
+void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad,
const u8 *data, size_t data_len, u8 *mic)
{
- const u8 *addr[3];
- size_t len[3];
- u8 zero[CMAC_TLEN_256];
+ SHASH_DESC_ON_STACK(desc, tfm);
- memset(zero, 0, CMAC_TLEN_256);
- addr[0] = aad;
- len[0] = AAD_LEN;
- addr[1] = data;
- len[1] = data_len - CMAC_TLEN_256;
- addr[2] = zero;
- len[2] = CMAC_TLEN_256;
+ desc->tfm = tfm;
- aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN_256);
+ crypto_shash_init(desc);
+ crypto_shash_update(desc, aad, AAD_LEN);
+ crypto_shash_update(desc, data, data_len - CMAC_TLEN_256);
+ crypto_shash_finup(desc, zero, CMAC_TLEN_256, mic);
}
-struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[],
- size_t key_len)
+struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[],
+ size_t key_len)
{
- struct crypto_cipher *tfm;
+ struct crypto_shash *tfm;
- tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+ tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
if (!IS_ERR(tfm))
- crypto_cipher_setkey(tfm, key, key_len);
+ crypto_shash_setkey(tfm, key, key_len);
return tfm;
}
-
-void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm)
+void ieee80211_aes_cmac_key_free(struct crypto_shash *tfm)
{
- crypto_free_cipher(tfm);
+ crypto_free_shash(tfm);
}
diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h
index 3702041f44fd..fef531f42003 100644
--- a/net/mac80211/aes_cmac.h
+++ b/net/mac80211/aes_cmac.h
@@ -10,13 +10,14 @@
#define AES_CMAC_H
#include <linux/crypto.h>
+#include <crypto/hash.h>
-struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[],
- size_t key_len);
-void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad,
+struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[],
+ size_t key_len);
+void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad,
const u8 *data, size_t data_len, u8 *mic);
-void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad,
+void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad,
const u8 *data, size_t data_len, u8 *mic);
-void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm);
+void ieee80211_aes_cmac_key_free(struct crypto_shash *tfm);
#endif /* AES_CMAC_H */
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index f6749dced021..4456559cb056 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -85,7 +85,7 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
ht_dbg(sta->sdata,
"Rx BA session stop requested for %pM tid %u %s reason: %d\n",
sta->sta.addr, tid,
- initiator == WLAN_BACK_RECIPIENT ? "recipient" : "inititator",
+ initiator == WLAN_BACK_RECIPIENT ? "recipient" : "initiator",
(int)reason);
if (drv_ampdu_action(local, sta->sdata, &params))
@@ -315,11 +315,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
mutex_lock(&sta->ampdu_mlme.mtx);
if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) {
- tid_agg_rx = rcu_dereference_protected(
- sta->ampdu_mlme.tid_rx[tid],
- lockdep_is_held(&sta->ampdu_mlme.mtx));
-
- if (tid_agg_rx->dialog_token == dialog_token) {
+ if (sta->ampdu_mlme.tid_rx_token[tid] == dialog_token) {
ht_dbg_ratelimited(sta->sdata,
"updated AddBA Req from %pM on tid %u\n",
sta->sta.addr, tid);
@@ -396,13 +392,13 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
}
/* update data */
- tid_agg_rx->dialog_token = dialog_token;
tid_agg_rx->ssn = start_seq_num;
tid_agg_rx->head_seq_num = start_seq_num;
tid_agg_rx->buf_size = buf_size;
tid_agg_rx->timeout = timeout;
tid_agg_rx->stored_mpdu_num = 0;
tid_agg_rx->auto_seq = auto_seq;
+ tid_agg_rx->started = false;
tid_agg_rx->reorder_buf_filtered = 0;
status = WLAN_STATUS_SUCCESS;
@@ -418,6 +414,7 @@ end:
if (status == WLAN_STATUS_SUCCESS) {
__set_bit(tid, sta->ampdu_mlme.agg_session_valid);
__clear_bit(tid, sta->ampdu_mlme.unexpected_agg);
+ sta->ampdu_mlme.tid_rx_token[tid] = dialog_token;
}
mutex_unlock(&sta->ampdu_mlme.mtx);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index fd6541f3ade3..ac879bb17870 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -208,8 +208,8 @@ static int ieee80211_nan_change_conf(struct wiphy *wiphy,
if (changes & CFG80211_NAN_CONF_CHANGED_PREF)
new_conf.master_pref = conf->master_pref;
- if (changes & CFG80211_NAN_CONF_CHANGED_DUAL)
- new_conf.dual = conf->dual;
+ if (changes & CFG80211_NAN_CONF_CHANGED_BANDS)
+ new_conf.bands = conf->bands;
ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes);
if (!ret)
@@ -357,10 +357,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
mutex_lock(&local->sta_mtx);
if (mac_addr) {
- if (ieee80211_vif_is_mesh(&sdata->vif))
- sta = sta_info_get(sdata, mac_addr);
- else
- sta = sta_info_get_bss(sdata, mac_addr);
+ sta = sta_info_get_bss(sdata, mac_addr);
/*
* The ASSOC test makes sure the driver is ready to
* receive the key. When wpa_supplicant has roamed
@@ -867,6 +864,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
}
sdata->needed_rx_chains = sdata->local->rx_chains;
+ sdata->vif.bss_conf.beacon_int = params->beacon_interval;
+
mutex_lock(&local->mtx);
err = ieee80211_vif_use_channel(sdata, &params->chandef,
IEEE80211_CHANCTX_SHARED);
@@ -897,7 +896,6 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
vlan->vif.type);
}
- sdata->vif.bss_conf.beacon_int = params->beacon_interval;
sdata->vif.bss_conf.dtim_period = params->dtim_period;
sdata->vif.bss_conf.enable_beacon = true;
sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p;
@@ -1523,9 +1521,6 @@ static int ieee80211_change_station(struct wiphy *wiphy,
goto out_err;
if (params->vlan && params->vlan != sta->sdata->dev) {
- bool prev_4addr = false;
- bool new_4addr = false;
-
vlansdata = IEEE80211_DEV_TO_SUB_IF(params->vlan);
if (params->vlan->ieee80211_ptr->use_4addr) {
@@ -1535,26 +1530,21 @@ static int ieee80211_change_station(struct wiphy *wiphy,
}
rcu_assign_pointer(vlansdata->u.vlan.sta, sta);
- new_4addr = true;
__ieee80211_check_fast_rx_iface(vlansdata);
}
if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
- sta->sdata->u.vlan.sta) {
+ sta->sdata->u.vlan.sta)
RCU_INIT_POINTER(sta->sdata->u.vlan.sta, NULL);
- prev_4addr = true;
- }
+
+ if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
+ ieee80211_vif_dec_num_mcast(sta->sdata);
sta->sdata = vlansdata;
ieee80211_check_fast_xmit(sta);
- if (sta->sta_state == IEEE80211_STA_AUTHORIZED &&
- prev_4addr != new_4addr) {
- if (new_4addr)
- atomic_dec(&sta->sdata->bss->num_mcast_sta);
- else
- atomic_inc(&sta->sdata->bss->num_mcast_sta);
- }
+ if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
+ ieee80211_vif_inc_num_mcast(sta->sdata);
ieee80211_send_layer2_update(sta);
}
@@ -2480,13 +2470,6 @@ int __ieee80211_request_smps_ap(struct ieee80211_sub_if_data *sdata,
smps_mode == IEEE80211_SMPS_AUTOMATIC)
return 0;
- /* If no associated stations, there's no need to do anything */
- if (!atomic_read(&sdata->u.ap.num_mcast_sta)) {
- sdata->smps_mode = smps_mode;
- ieee80211_queue_work(&sdata->local->hw, &sdata->recalc_smps);
- return 0;
- }
-
ht_dbg(sdata,
"SMPS %d requested in AP mode, sending Action frame to %d stations\n",
smps_mode, atomic_read(&sdata->u.ap.num_mcast_sta));
@@ -3580,6 +3563,17 @@ void ieee80211_nan_func_match(struct ieee80211_vif *vif,
}
EXPORT_SYMBOL(ieee80211_nan_func_match);
+static int ieee80211_set_multicast_to_unicast(struct wiphy *wiphy,
+ struct net_device *dev,
+ const bool enabled)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ sdata->u.ap.multicast_to_unicast = enabled;
+
+ return 0;
+}
+
const struct cfg80211_ops mac80211_config_ops = {
.add_virtual_intf = ieee80211_add_iface,
.del_virtual_intf = ieee80211_del_iface,
@@ -3670,4 +3664,5 @@ const struct cfg80211_ops mac80211_config_ops = {
.nan_change_conf = ieee80211_nan_change_conf,
.add_nan_func = ieee80211_add_nan_func,
.del_nan_func = ieee80211_del_nan_func,
+ .set_multicast_to_unicast = ieee80211_set_multicast_to_unicast,
};
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index e75cbf6ecc26..89178b46b32f 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -231,9 +231,6 @@ ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata)
!(sta->sdata->bss && sta->sdata->bss == sdata->bss))
continue;
- if (!sta->uploaded || !test_sta_flag(sta, WLAN_STA_ASSOC))
- continue;
-
max_bw = max(max_bw, ieee80211_get_sta_bw(&sta->sta));
}
rcu_read_unlock();
@@ -1270,7 +1267,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
struct ieee80211_sub_if_data *sdata, *sdata_tmp;
struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx;
struct ieee80211_chanctx *new_ctx = NULL;
- int i, err, n_assigned, n_reserved, n_ready;
+ int err, n_assigned, n_reserved, n_ready;
int n_ctx = 0, n_vifs_switch = 0, n_vifs_assign = 0, n_vifs_ctxless = 0;
lockdep_assert_held(&local->mtx);
@@ -1391,8 +1388,6 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
* Update all structures, values and pointers to point to new channel
* context(s).
*/
-
- i = 0;
list_for_each_entry(ctx, &local->chanctx_list, list) {
if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
continue;
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index f56e2f487d09..5fae001f286c 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -210,6 +210,7 @@ static const char *hw_flag_names[] = {
FLAG(TX_AMSDU),
FLAG(TX_FRAG_LIST),
FLAG(REPORTS_LOW_ACK),
+ FLAG(SUPPORTS_TX_FRAG),
#undef FLAG
};
@@ -242,6 +243,38 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf,
return rv;
}
+static ssize_t misc_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ /* Max len of each line is 16 characters, plus 9 for 'pending:\n' */
+ size_t bufsz = IEEE80211_MAX_QUEUES * 16 + 9;
+ char *buf;
+ char *pos, *end;
+ ssize_t rv;
+ int i;
+ int ln;
+
+ buf = kzalloc(bufsz, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ pos = buf;
+ end = buf + bufsz - 1;
+
+ pos += scnprintf(pos, end - pos, "pending:\n");
+
+ for (i = 0; i < IEEE80211_MAX_QUEUES; i++) {
+ ln = skb_queue_len(&local->pending[i]);
+ pos += scnprintf(pos, end - pos, "[%i] %d\n",
+ i, ln);
+ }
+
+ rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
+ kfree(buf);
+ return rv;
+}
+
static ssize_t queues_read(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
@@ -262,6 +295,7 @@ static ssize_t queues_read(struct file *file, char __user *user_buf,
DEBUGFS_READONLY_FILE_OPS(hwflags);
DEBUGFS_READONLY_FILE_OPS(queues);
+DEBUGFS_READONLY_FILE_OPS(misc);
/* statistics stuff */
@@ -329,7 +363,9 @@ void debugfs_hw_add(struct ieee80211_local *local)
DEBUGFS_ADD(total_ps_buffered);
DEBUGFS_ADD(wep_iv);
+ DEBUGFS_ADD(rate_ctrl_alg);
DEBUGFS_ADD(queues);
+ DEBUGFS_ADD(misc);
#ifdef CONFIG_PM
DEBUGFS_ADD_MODE(reset, 0200);
#endif
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index bcec1240f41d..8f5fff8b2040 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -477,6 +477,7 @@ IEEE80211_IF_FILE_RW(tdls_wider_bw);
IEEE80211_IF_FILE(num_mcast_sta, u.ap.num_mcast_sta, ATOMIC);
IEEE80211_IF_FILE(num_sta_ps, u.ap.ps.num_sta_ps, ATOMIC);
IEEE80211_IF_FILE(dtim_count, u.ap.ps.dtim_count, DEC);
+IEEE80211_IF_FILE(num_mcast_sta_vlan, u.vlan.num_mcast_sta, ATOMIC);
static ssize_t ieee80211_if_fmt_num_buffered_multicast(
const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
@@ -518,6 +519,8 @@ static ssize_t ieee80211_if_fmt_aqm(
}
IEEE80211_IF_FILE_R(aqm);
+IEEE80211_IF_FILE(multicast_to_unicast, u.ap.multicast_to_unicast, HEX);
+
/* IBSS attributes */
static ssize_t ieee80211_if_fmt_tsf(
const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
@@ -682,6 +685,14 @@ static void add_ap_files(struct ieee80211_sub_if_data *sdata)
DEBUGFS_ADD(dtim_count);
DEBUGFS_ADD(num_buffered_multicast);
DEBUGFS_ADD_MODE(tkip_mic_test, 0200);
+ DEBUGFS_ADD_MODE(multicast_to_unicast, 0600);
+}
+
+static void add_vlan_files(struct ieee80211_sub_if_data *sdata)
+{
+ /* add num_mcast_sta_vlan using name num_mcast_sta */
+ debugfs_create_file("num_mcast_sta", 0400, sdata->vif.debugfs_dir,
+ sdata, &num_mcast_sta_vlan_ops);
}
static void add_ibss_files(struct ieee80211_sub_if_data *sdata)
@@ -787,6 +798,9 @@ static void add_files(struct ieee80211_sub_if_data *sdata)
case NL80211_IFTYPE_AP:
add_ap_files(sdata);
break;
+ case NL80211_IFTYPE_AP_VLAN:
+ add_vlan_files(sdata);
+ break;
case NL80211_IFTYPE_WDS:
add_wds_files(sdata);
break;
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index a2fcdb47a0e6..42601820db20 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -199,13 +199,18 @@ static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf,
"TID\t\tRX\tDTKN\tSSN\t\tTX\tDTKN\tpending\n");
for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
+ bool tid_rx_valid;
+
tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[i]);
tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[i]);
+ tid_rx_valid = test_bit(i, sta->ampdu_mlme.agg_session_valid);
p += scnprintf(p, sizeof(buf) + buf - p, "%02d", i);
- p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x", !!tid_rx);
+ p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x",
+ tid_rx_valid);
p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.2x",
- tid_rx ? tid_rx->dialog_token : 0);
+ tid_rx_valid ?
+ sta->ampdu_mlme.tid_rx_token[i] : 0);
p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.3x",
tid_rx ? tid_rx->ssn : 0);
@@ -517,6 +522,7 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
return;
DEBUGFS_ADD(flags);
+ DEBUGFS_ADD(aid);
DEBUGFS_ADD(num_ps_buf_frames);
DEBUGFS_ADD(last_seq_ctrl);
DEBUGFS_ADD(agg_status);
diff --git a/net/mac80211/fils_aead.c b/net/mac80211/fils_aead.c
new file mode 100644
index 000000000000..3cfb1e2ab7ac
--- /dev/null
+++ b/net/mac80211/fils_aead.c
@@ -0,0 +1,334 @@
+/*
+ * FILS AEAD for (Re)Association Request/Response frames
+ * Copyright 2016, Qualcomm Atheros, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/aes.h>
+#include <crypto/algapi.h>
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
+
+#include "ieee80211_i.h"
+#include "aes_cmac.h"
+#include "fils_aead.h"
+
+static void gf_mulx(u8 *pad)
+{
+ u64 a = get_unaligned_be64(pad);
+ u64 b = get_unaligned_be64(pad + 8);
+
+ put_unaligned_be64((a << 1) | (b >> 63), pad);
+ put_unaligned_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0), pad + 8);
+}
+
+static int aes_s2v(struct crypto_shash *tfm,
+ size_t num_elem, const u8 *addr[], size_t len[], u8 *v)
+{
+ u8 d[AES_BLOCK_SIZE], tmp[AES_BLOCK_SIZE] = {};
+ SHASH_DESC_ON_STACK(desc, tfm);
+ size_t i;
+
+ desc->tfm = tfm;
+
+ /* D = AES-CMAC(K, <zero>) */
+ crypto_shash_digest(desc, tmp, AES_BLOCK_SIZE, d);
+
+ for (i = 0; i < num_elem - 1; i++) {
+ /* D = dbl(D) xor AES_CMAC(K, Si) */
+ gf_mulx(d); /* dbl */
+ crypto_shash_digest(desc, addr[i], len[i], tmp);
+ crypto_xor(d, tmp, AES_BLOCK_SIZE);
+ }
+
+ crypto_shash_init(desc);
+
+ if (len[i] >= AES_BLOCK_SIZE) {
+ /* len(Sn) >= 128 */
+ /* T = Sn xorend D */
+ crypto_shash_update(desc, addr[i], len[i] - AES_BLOCK_SIZE);
+ crypto_xor(d, addr[i] + len[i] - AES_BLOCK_SIZE,
+ AES_BLOCK_SIZE);
+ } else {
+ /* len(Sn) < 128 */
+ /* T = dbl(D) xor pad(Sn) */
+ gf_mulx(d); /* dbl */
+ crypto_xor(d, addr[i], len[i]);
+ d[len[i]] ^= 0x80;
+ }
+ /* V = AES-CMAC(K, T) */
+ crypto_shash_finup(desc, d, AES_BLOCK_SIZE, v);
+
+ return 0;
+}
+
+/* Note: addr[] and len[] needs to have one extra slot at the end. */
+static int aes_siv_encrypt(const u8 *key, size_t key_len,
+ const u8 *plain, size_t plain_len,
+ size_t num_elem, const u8 *addr[],
+ size_t len[], u8 *out)
+{
+ u8 v[AES_BLOCK_SIZE];
+ struct crypto_shash *tfm;
+ struct crypto_skcipher *tfm2;
+ struct skcipher_request *req;
+ int res;
+ struct scatterlist src[1], dst[1];
+ u8 *tmp;
+
+ key_len /= 2; /* S2V key || CTR key */
+
+ addr[num_elem] = plain;
+ len[num_elem] = plain_len;
+ num_elem++;
+
+ /* S2V */
+
+ tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+ /* K1 for S2V */
+ res = crypto_shash_setkey(tfm, key, key_len);
+ if (!res)
+ res = aes_s2v(tfm, num_elem, addr, len, v);
+ crypto_free_shash(tfm);
+ if (res)
+ return res;
+
+ /* Use a temporary buffer of the plaintext to handle need for
+ * overwriting this during AES-CTR.
+ */
+ tmp = kmemdup(plain, plain_len, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ /* IV for CTR before encrypted data */
+ memcpy(out, v, AES_BLOCK_SIZE);
+
+ /* Synthetic IV to be used as the initial counter in CTR:
+ * Q = V bitand (1^64 || 0^1 || 1^31 || 0^1 || 1^31)
+ */
+ v[8] &= 0x7f;
+ v[12] &= 0x7f;
+
+ /* CTR */
+
+ tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm2)) {
+ kfree(tmp);
+ return PTR_ERR(tfm2);
+ }
+ /* K2 for CTR */
+ res = crypto_skcipher_setkey(tfm2, key + key_len, key_len);
+ if (res)
+ goto fail;
+
+ req = skcipher_request_alloc(tfm2, GFP_KERNEL);
+ if (!req) {
+ res = -ENOMEM;
+ goto fail;
+ }
+
+ sg_init_one(src, tmp, plain_len);
+ sg_init_one(dst, out + AES_BLOCK_SIZE, plain_len);
+ skcipher_request_set_crypt(req, src, dst, plain_len, v);
+ res = crypto_skcipher_encrypt(req);
+ skcipher_request_free(req);
+fail:
+ kfree(tmp);
+ crypto_free_skcipher(tfm2);
+ return res;
+}
+
+/* Note: addr[] and len[] needs to have one extra slot at the end. */
+static int aes_siv_decrypt(const u8 *key, size_t key_len,
+ const u8 *iv_crypt, size_t iv_c_len,
+ size_t num_elem, const u8 *addr[], size_t len[],
+ u8 *out)
+{
+ struct crypto_shash *tfm;
+ struct crypto_skcipher *tfm2;
+ struct skcipher_request *req;
+ struct scatterlist src[1], dst[1];
+ size_t crypt_len;
+ int res;
+ u8 frame_iv[AES_BLOCK_SIZE], iv[AES_BLOCK_SIZE];
+ u8 check[AES_BLOCK_SIZE];
+
+ crypt_len = iv_c_len - AES_BLOCK_SIZE;
+ key_len /= 2; /* S2V key || CTR key */
+ addr[num_elem] = out;
+ len[num_elem] = crypt_len;
+ num_elem++;
+
+ memcpy(iv, iv_crypt, AES_BLOCK_SIZE);
+ memcpy(frame_iv, iv_crypt, AES_BLOCK_SIZE);
+
+ /* Synthetic IV to be used as the initial counter in CTR:
+ * Q = V bitand (1^64 || 0^1 || 1^31 || 0^1 || 1^31)
+ */
+ iv[8] &= 0x7f;
+ iv[12] &= 0x7f;
+
+ /* CTR */
+
+ tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm2))
+ return PTR_ERR(tfm2);
+ /* K2 for CTR */
+ res = crypto_skcipher_setkey(tfm2, key + key_len, key_len);
+ if (res) {
+ crypto_free_skcipher(tfm2);
+ return res;
+ }
+
+ req = skcipher_request_alloc(tfm2, GFP_KERNEL);
+ if (!req) {
+ crypto_free_skcipher(tfm2);
+ return -ENOMEM;
+ }
+
+ sg_init_one(src, iv_crypt + AES_BLOCK_SIZE, crypt_len);
+ sg_init_one(dst, out, crypt_len);
+ skcipher_request_set_crypt(req, src, dst, crypt_len, iv);
+ res = crypto_skcipher_decrypt(req);
+ skcipher_request_free(req);
+ crypto_free_skcipher(tfm2);
+ if (res)
+ return res;
+
+ /* S2V */
+
+ tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+ /* K1 for S2V */
+ res = crypto_shash_setkey(tfm, key, key_len);
+ if (!res)
+ res = aes_s2v(tfm, num_elem, addr, len, check);
+ crypto_free_shash(tfm);
+ if (res)
+ return res;
+ if (memcmp(check, frame_iv, AES_BLOCK_SIZE) != 0)
+ return -EINVAL;
+ return 0;
+}
+
+int fils_encrypt_assoc_req(struct sk_buff *skb,
+ struct ieee80211_mgd_assoc_data *assoc_data)
+{
+ struct ieee80211_mgmt *mgmt = (void *)skb->data;
+ u8 *capab, *ies, *encr;
+ const u8 *addr[5 + 1], *session;
+ size_t len[5 + 1];
+ size_t crypt_len;
+
+ if (ieee80211_is_reassoc_req(mgmt->frame_control)) {
+ capab = (u8 *)&mgmt->u.reassoc_req.capab_info;
+ ies = mgmt->u.reassoc_req.variable;
+ } else {
+ capab = (u8 *)&mgmt->u.assoc_req.capab_info;
+ ies = mgmt->u.assoc_req.variable;
+ }
+
+ session = cfg80211_find_ext_ie(WLAN_EID_EXT_FILS_SESSION,
+ ies, skb->data + skb->len - ies);
+ if (!session || session[1] != 1 + 8)
+ return -EINVAL;
+ /* encrypt after FILS Session element */
+ encr = (u8 *)session + 2 + 1 + 8;
+
+ /* AES-SIV AAD vectors */
+
+ /* The STA's MAC address */
+ addr[0] = mgmt->sa;
+ len[0] = ETH_ALEN;
+ /* The AP's BSSID */
+ addr[1] = mgmt->da;
+ len[1] = ETH_ALEN;
+ /* The STA's nonce */
+ addr[2] = assoc_data->fils_nonces;
+ len[2] = FILS_NONCE_LEN;
+ /* The AP's nonce */
+ addr[3] = &assoc_data->fils_nonces[FILS_NONCE_LEN];
+ len[3] = FILS_NONCE_LEN;
+ /* The (Re)Association Request frame from the Capability Information
+ * field to the FILS Session element (both inclusive).
+ */
+ addr[4] = capab;
+ len[4] = encr - capab;
+
+ crypt_len = skb->data + skb->len - encr;
+ skb_put(skb, AES_BLOCK_SIZE);
+ return aes_siv_encrypt(assoc_data->fils_kek, assoc_data->fils_kek_len,
+ encr, crypt_len, 5, addr, len, encr);
+}
+
+int fils_decrypt_assoc_resp(struct ieee80211_sub_if_data *sdata,
+ u8 *frame, size_t *frame_len,
+ struct ieee80211_mgd_assoc_data *assoc_data)
+{
+ struct ieee80211_mgmt *mgmt = (void *)frame;
+ u8 *capab, *ies, *encr;
+ const u8 *addr[5 + 1], *session;
+ size_t len[5 + 1];
+ int res;
+ size_t crypt_len;
+
+ if (*frame_len < 24 + 6)
+ return -EINVAL;
+
+ capab = (u8 *)&mgmt->u.assoc_resp.capab_info;
+ ies = mgmt->u.assoc_resp.variable;
+ session = cfg80211_find_ext_ie(WLAN_EID_EXT_FILS_SESSION,
+ ies, frame + *frame_len - ies);
+ if (!session || session[1] != 1 + 8) {
+ mlme_dbg(sdata,
+ "No (valid) FILS Session element in (Re)Association Response frame from %pM",
+ mgmt->sa);
+ return -EINVAL;
+ }
+ /* decrypt after FILS Session element */
+ encr = (u8 *)session + 2 + 1 + 8;
+
+ /* AES-SIV AAD vectors */
+
+ /* The AP's BSSID */
+ addr[0] = mgmt->sa;
+ len[0] = ETH_ALEN;
+ /* The STA's MAC address */
+ addr[1] = mgmt->da;
+ len[1] = ETH_ALEN;
+ /* The AP's nonce */
+ addr[2] = &assoc_data->fils_nonces[FILS_NONCE_LEN];
+ len[2] = FILS_NONCE_LEN;
+ /* The STA's nonce */
+ addr[3] = assoc_data->fils_nonces;
+ len[3] = FILS_NONCE_LEN;
+ /* The (Re)Association Response frame from the Capability Information
+ * field to the FILS Session element (both inclusive).
+ */
+ addr[4] = capab;
+ len[4] = encr - capab;
+
+ crypt_len = frame + *frame_len - encr;
+ if (crypt_len < AES_BLOCK_SIZE) {
+ mlme_dbg(sdata,
+ "Not enough room for AES-SIV data after FILS Session element in (Re)Association Response frame from %pM",
+ mgmt->sa);
+ return -EINVAL;
+ }
+ res = aes_siv_decrypt(assoc_data->fils_kek, assoc_data->fils_kek_len,
+ encr, crypt_len, 5, addr, len, encr);
+ if (res != 0) {
+ mlme_dbg(sdata,
+ "AES-SIV decryption of (Re)Association Response frame from %pM failed",
+ mgmt->sa);
+ return res;
+ }
+ *frame_len -= AES_BLOCK_SIZE;
+ return 0;
+}
diff --git a/net/mac80211/fils_aead.h b/net/mac80211/fils_aead.h
new file mode 100644
index 000000000000..fbc65232f0b3
--- /dev/null
+++ b/net/mac80211/fils_aead.h
@@ -0,0 +1,19 @@
+/*
+ * FILS AEAD for (Re)Association Request/Response frames
+ * Copyright 2016, Qualcomm Atheros, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef FILS_AEAD_H
+#define FILS_AEAD_H
+
+int fils_encrypt_assoc_req(struct sk_buff *skb,
+ struct ieee80211_mgd_assoc_data *assoc_data);
+int fils_decrypt_assoc_resp(struct ieee80211_sub_if_data *sdata,
+ u8 *frame, size_t *frame_len,
+ struct ieee80211_mgd_assoc_data *assoc_data);
+
+#endif /* FILS_AEAD_H */
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index a31d30713d08..98999d3d5262 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -487,14 +487,14 @@ int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata,
struct beacon_data *presp, *old_presp;
struct cfg80211_bss *cbss;
const struct cfg80211_bss_ies *ies;
- u16 capability = 0;
+ u16 capability = WLAN_CAPABILITY_IBSS;
u64 tsf;
int ret = 0;
sdata_assert_lock(sdata);
if (ifibss->privacy)
- capability = WLAN_CAPABILITY_PRIVACY;
+ capability |= WLAN_CAPABILITY_PRIVACY;
cbss = cfg80211_get_bss(sdata->local->hw.wiphy, ifibss->chandef.chan,
ifibss->bssid, ifibss->ssid,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 34c2add2c455..0e718437d080 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -84,6 +84,8 @@ struct ieee80211_local;
#define IEEE80211_DEFAULT_MAX_SP_LEN \
IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL
+extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS];
+
#define IEEE80211_DEAUTH_FRAME_LEN (24 /* hdr */ + 2 /* reason */)
#define IEEE80211_MAX_NAN_INSTANCE_ID 255
@@ -157,7 +159,7 @@ enum ieee80211_bss_valid_data_flags {
IEEE80211_BSS_VALID_ERP = BIT(3)
};
-typedef unsigned __bitwise__ ieee80211_tx_result;
+typedef unsigned __bitwise ieee80211_tx_result;
#define TX_CONTINUE ((__force ieee80211_tx_result) 0u)
#define TX_DROP ((__force ieee80211_tx_result) 1u)
#define TX_QUEUED ((__force ieee80211_tx_result) 2u)
@@ -178,7 +180,7 @@ struct ieee80211_tx_data {
};
-typedef unsigned __bitwise__ ieee80211_rx_result;
+typedef unsigned __bitwise ieee80211_rx_result;
#define RX_CONTINUE ((__force ieee80211_rx_result) 0u)
#define RX_DROP_UNUSABLE ((__force ieee80211_rx_result) 1u)
#define RX_DROP_MONITOR ((__force ieee80211_rx_result) 2u)
@@ -295,6 +297,7 @@ struct ieee80211_if_ap {
driver_smps_mode; /* smps mode request */
struct work_struct request_smps_work;
+ bool multicast_to_unicast;
};
struct ieee80211_if_wds {
@@ -307,6 +310,7 @@ struct ieee80211_if_vlan {
/* used for all tx if the VLAN is configured to 4-addr mode */
struct sta_info __rcu *sta;
+ atomic_t num_mcast_sta; /* number of stations receiving multicast */
};
struct mesh_stats {
@@ -398,6 +402,10 @@ struct ieee80211_mgd_assoc_data {
struct ieee80211_vht_cap ap_vht_cap;
+ u8 fils_nonces[2 * FILS_NONCE_LEN];
+ u8 fils_kek[FILS_MAX_KEK_LEN];
+ size_t fils_kek_len;
+
size_t ie_len;
u8 ie[];
};
@@ -420,7 +428,7 @@ struct ieee80211_sta_tx_tspec {
bool downgraded;
};
-DECLARE_EWMA(beacon_signal, 16, 4)
+DECLARE_EWMA(beacon_signal, 4, 4)
struct ieee80211_if_managed {
struct timer_list timer;
@@ -442,7 +450,7 @@ struct ieee80211_if_managed {
struct ieee80211_mgd_auth_data *auth_data;
struct ieee80211_mgd_assoc_data *assoc_data;
- u8 bssid[ETH_ALEN];
+ u8 bssid[ETH_ALEN] __aligned(2);
u16 aid;
@@ -617,8 +625,8 @@ struct ieee80211_mesh_sync_ops {
struct ieee80211_rx_status *rx_status);
/* should be called with beacon_data under RCU read lock */
- void (*adjust_tbtt)(struct ieee80211_sub_if_data *sdata,
- struct beacon_data *beacon);
+ void (*adjust_tsf)(struct ieee80211_sub_if_data *sdata,
+ struct beacon_data *beacon);
/* add other framework functions here */
};
@@ -681,7 +689,6 @@ struct ieee80211_if_mesh {
const struct ieee80211_mesh_sync_ops *sync_ops;
s64 sync_offset_clockdrift_max;
spinlock_t sync_offset_lock;
- bool adjusting_tbtt;
/* mesh power save */
enum nl80211_mesh_power_mode nonpeer_pm;
int ps_peers_light_sleep;
@@ -1527,6 +1534,23 @@ ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status)
return false;
}
+void ieee80211_vif_inc_num_mcast(struct ieee80211_sub_if_data *sdata);
+void ieee80211_vif_dec_num_mcast(struct ieee80211_sub_if_data *sdata);
+
+/* This function returns the number of multicast stations connected to this
+ * interface. It returns -1 if that number is not tracked, that is for netdevs
+ * not in AP or AP_VLAN mode or when using 4addr.
+ */
+static inline int
+ieee80211_vif_get_num_mcast_if(struct ieee80211_sub_if_data *sdata)
+{
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ return atomic_read(&sdata->u.ap.num_mcast_sta);
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && !sdata->u.vlan.sta)
+ return atomic_read(&sdata->u.vlan.num_mcast_sta);
+ return -1;
+}
+
u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
struct ieee80211_rx_status *status,
unsigned int mpdu_len,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 638ec0759078..5bb0c5012819 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -6,6 +6,7 @@
* Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
* Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright (c) 2016 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -150,15 +151,6 @@ void ieee80211_recalc_idle(struct ieee80211_local *local)
ieee80211_hw_config(local, change);
}
-static int ieee80211_change_mtu(struct net_device *dev, int new_mtu)
-{
- if (new_mtu < 256 || new_mtu > IEEE80211_MAX_DATA_LEN)
- return -EINVAL;
-
- dev->mtu = new_mtu;
- return 0;
-}
-
static int ieee80211_verify_mac(struct ieee80211_sub_if_data *sdata, u8 *addr,
bool check_dup)
{
@@ -726,7 +718,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
ieee80211_recalc_ps(local);
if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
- sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
+ local->ops->wake_tx_queue) {
/* XXX: for AP_VLAN, actually track AP queues */
netif_tx_start_all_queues(dev);
} else if (dev) {
@@ -1131,7 +1124,7 @@ static u16 ieee80211_netdev_select_queue(struct net_device *dev,
return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb);
}
-static struct rtnl_link_stats64 *
+static void
ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
int i;
@@ -1156,8 +1149,6 @@ ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
stats->rx_bytes += rx_bytes;
stats->tx_bytes += tx_bytes;
}
-
- return stats;
}
static const struct net_device_ops ieee80211_dataif_ops = {
@@ -1166,7 +1157,6 @@ static const struct net_device_ops ieee80211_dataif_ops = {
.ndo_uninit = ieee80211_uninit,
.ndo_start_xmit = ieee80211_subif_start_xmit,
.ndo_set_rx_mode = ieee80211_set_multicast_list,
- .ndo_change_mtu = ieee80211_change_mtu,
.ndo_set_mac_address = ieee80211_change_mac,
.ndo_select_queue = ieee80211_netdev_select_queue,
.ndo_get_stats64 = ieee80211_get_stats64,
@@ -1200,7 +1190,6 @@ static const struct net_device_ops ieee80211_monitorif_ops = {
.ndo_uninit = ieee80211_uninit,
.ndo_start_xmit = ieee80211_monitor_start_xmit,
.ndo_set_rx_mode = ieee80211_set_multicast_list,
- .ndo_change_mtu = ieee80211_change_mtu,
.ndo_set_mac_address = ieee80211_change_mac,
.ndo_select_queue = ieee80211_monitor_select_queue,
.ndo_get_stats64 = ieee80211_get_stats64,
@@ -1306,6 +1295,26 @@ static void ieee80211_iface_work(struct work_struct *work)
} else if (ieee80211_is_action(mgmt->frame_control) &&
mgmt->u.action.category == WLAN_CATEGORY_VHT) {
switch (mgmt->u.action.u.vht_group_notif.action_code) {
+ case WLAN_VHT_ACTION_OPMODE_NOTIF: {
+ struct ieee80211_rx_status *status;
+ enum nl80211_band band;
+ u8 opmode;
+
+ status = IEEE80211_SKB_RXCB(skb);
+ band = status->band;
+ opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode;
+
+ mutex_lock(&local->sta_mtx);
+ sta = sta_info_get_bss(sdata, mgmt->sa);
+
+ if (sta)
+ ieee80211_vht_handle_opmode(sdata, sta,
+ opmode,
+ band);
+
+ mutex_unlock(&local->sta_mtx);
+ break;
+ }
case WLAN_VHT_ACTION_GROUPID_MGMT:
ieee80211_process_mu_groups(sdata, mgmt);
break;
@@ -1884,6 +1893,10 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
netdev_set_default_ethtool_ops(ndev, &ieee80211_ethtool_ops);
+ /* MTU range: 256 - 2304 */
+ ndev->min_mtu = 256;
+ ndev->max_mtu = IEEE80211_MAX_DATA_LEN;
+
ret = register_netdevice(ndev);
if (ret) {
ieee80211_if_free(ndev);
@@ -2005,3 +2018,19 @@ void ieee80211_iface_exit(void)
{
unregister_netdevice_notifier(&mac80211_netdev_notifier);
}
+
+void ieee80211_vif_inc_num_mcast(struct ieee80211_sub_if_data *sdata)
+{
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ atomic_inc(&sdata->u.ap.num_mcast_sta);
+ else if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ atomic_inc(&sdata->u.vlan.num_mcast_sta);
+}
+
+void ieee80211_vif_dec_num_mcast(struct ieee80211_sub_if_data *sdata)
+{
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ atomic_dec(&sdata->u.ap.num_mcast_sta);
+ else if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ atomic_dec(&sdata->u.vlan.num_mcast_sta);
+}
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index edd6f2945f69..a98fc2b5e0dc 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -265,7 +265,8 @@ static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata,
if (uni) {
rcu_assign_pointer(sdata->default_unicast_key, key);
ieee80211_check_fast_xmit_iface(sdata);
- drv_set_default_unicast_key(sdata->local, sdata, idx);
+ if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN)
+ drv_set_default_unicast_key(sdata->local, sdata, idx);
}
if (multi)
diff --git a/net/mac80211/key.h b/net/mac80211/key.h
index 4aa20cef0859..ebdb80b85dc3 100644
--- a/net/mac80211/key.h
+++ b/net/mac80211/key.h
@@ -93,7 +93,7 @@ struct ieee80211_key {
} ccmp;
struct {
u8 rx_pn[IEEE80211_CMAC_PN_LEN];
- struct crypto_cipher *tfm;
+ struct crypto_shash *tfm;
u32 replays; /* dot11RSNAStatsCMACReplays */
u32 icverrors; /* dot11RSNAStatsCMACICVErrors */
} aes_cmac;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 1075ac24c8c5..56fb47953b72 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -549,6 +549,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
NL80211_FEATURE_MAC_ON_CREATE |
NL80211_FEATURE_USERSPACE_MPM |
NL80211_FEATURE_FULL_AP_CLIENT_STATE;
+ wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_FILS_STA);
if (!ops->hw_scan)
wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN |
@@ -821,6 +822,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
!local->ops->tdls_recv_channel_switch))
return -EOPNOTSUPP;
+ if (WARN_ON(ieee80211_hw_check(hw, SUPPORTS_TX_FRAG) &&
+ !local->ops->set_frag_threshold))
+ return -EINVAL;
+
if (WARN_ON(local->hw.wiphy->interface_modes &
BIT(NL80211_IFTYPE_NAN) &&
(!local->ops->start_nan || !local->ops->stop_nan)))
@@ -908,12 +913,17 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
supp_ht = supp_ht || sband->ht_cap.ht_supported;
supp_vht = supp_vht || sband->vht_cap.vht_supported;
- if (sband->ht_cap.ht_supported)
- local->rx_chains =
- max(ieee80211_mcs_to_chains(&sband->ht_cap.mcs),
- local->rx_chains);
+ if (!sband->ht_cap.ht_supported)
+ continue;
/* TODO: consider VHT for RX chains, hopefully it's the same */
+ local->rx_chains =
+ max(ieee80211_mcs_to_chains(&sband->ht_cap.mcs),
+ local->rx_chains);
+
+ /* no need to mask, SM_PS_DISABLED has all bits set */
+ sband->ht_cap.cap |= WLAN_HT_CAP_SM_PS_DISABLED <<
+ IEEE80211_HT_CAP_SM_PS_SHIFT;
}
/* if low-level driver supports AP, we also support VLAN */
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 42120d965263..6e7b6a07b7d5 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -279,10 +279,6 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata,
/* Mesh PS mode. See IEEE802.11-2012 8.4.2.100.8 */
*pos |= ifmsh->ps_peers_deep_sleep ?
IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL : 0x00;
- *pos++ |= ifmsh->adjusting_tbtt ?
- IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING : 0x00;
- *pos++ = 0x00;
-
return 0;
}
@@ -339,7 +335,7 @@ int mesh_add_vendor_ies(struct ieee80211_sub_if_data *sdata,
/* fast-forward to vendor IEs */
offset = ieee80211_ie_split_vendor(ifmsh->ie, ifmsh->ie_len, 0);
- if (offset) {
+ if (offset < ifmsh->ie_len) {
len = ifmsh->ie_len - offset;
data = ifmsh->ie + offset;
if (skb_tailroom(skb) < len)
@@ -685,7 +681,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
2 + /* NULL SSID */
/* Channel Switch Announcement */
2 + sizeof(struct ieee80211_channel_sw_ie) +
- /* Mesh Channel Swith Parameters */
+ /* Mesh Channel Switch Parameters */
2 + sizeof(struct ieee80211_mesh_chansw_params_ie) +
2 + 8 + /* supported rates */
2 + 3; /* DS params */
@@ -850,7 +846,6 @@ int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
ifmsh->mesh_cc_id = 0; /* Disabled */
/* register sync ops from extensible synchronization framework */
ifmsh->sync_ops = ieee80211_mesh_sync_ops_get(ifmsh->mesh_sp_id);
- ifmsh->adjusting_tbtt = false;
ifmsh->sync_offset_clockdrift_max = 0;
set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags);
ieee80211_mesh_root_setup(ifmsh);
@@ -1349,7 +1344,7 @@ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata)
ieee80211_mesh_rootpath(sdata);
if (test_and_clear_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags))
- mesh_sync_adjust_tbtt(sdata);
+ mesh_sync_adjust_tsf(sdata);
if (test_and_clear_bit(MESH_WORK_MBSS_CHANGED, &ifmsh->wrkq_flags))
mesh_bss_info_changed(sdata);
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index 26b9ccbe1fce..7e5f271e3c30 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -341,7 +341,7 @@ static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
}
void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata);
-void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata);
+void mesh_sync_adjust_tsf(struct ieee80211_sub_if_data *sdata);
void ieee80211s_stop(void);
#else
static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 7fcdcf622655..953d71e784a9 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -9,6 +9,8 @@
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/random.h>
+#include <linux/rculist.h>
+
#include "ieee80211_i.h"
#include "rate.h"
#include "mesh.h"
@@ -505,12 +507,14 @@ mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr,
/* Userspace handles station allocation */
if (sdata->u.mesh.user_mpm ||
- sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED)
- cfg80211_notify_new_peer_candidate(sdata->dev, addr,
- elems->ie_start,
- elems->total_len,
- GFP_KERNEL);
- else
+ sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) {
+ if (mesh_peer_accepts_plinks(elems) &&
+ mesh_plink_availables(sdata))
+ cfg80211_notify_new_peer_candidate(sdata->dev, addr,
+ elems->ie_start,
+ elems->total_len,
+ GFP_KERNEL);
+ } else
sta = __mesh_sta_info_alloc(sdata, addr);
return sta;
diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c
index faca22cd02b5..a435f094a82e 100644
--- a/net/mac80211/mesh_sync.c
+++ b/net/mac80211/mesh_sync.c
@@ -12,7 +12,7 @@
#include "mesh.h"
#include "driver-ops.h"
-/* This is not in the standard. It represents a tolerable tbtt drift below
+/* This is not in the standard. It represents a tolerable tsf drift below
* which we do no TSF adjustment.
*/
#define TOFFSET_MINIMUM_ADJUSTMENT 10
@@ -46,7 +46,7 @@ static bool mesh_peer_tbtt_adjusting(struct ieee802_11_elems *ie)
IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING) != 0;
}
-void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata)
+void mesh_sync_adjust_tsf(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
@@ -57,12 +57,12 @@ void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata)
spin_lock_bh(&ifmsh->sync_offset_lock);
if (ifmsh->sync_offset_clockdrift_max < beacon_int_fraction) {
- msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting\n",
+ msync_dbg(sdata, "TSF : max clockdrift=%lld; adjusting\n",
(long long) ifmsh->sync_offset_clockdrift_max);
tsfdelta = -ifmsh->sync_offset_clockdrift_max;
ifmsh->sync_offset_clockdrift_max = 0;
} else {
- msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting by %llu\n",
+ msync_dbg(sdata, "TSF : max clockdrift=%lld; adjusting by %llu\n",
(long long) ifmsh->sync_offset_clockdrift_max,
(unsigned long long) beacon_int_fraction);
tsfdelta = -beacon_int_fraction;
@@ -123,7 +123,6 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
*/
if (elems->mesh_config && mesh_peer_tbtt_adjusting(elems)) {
- clear_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN);
msync_dbg(sdata, "STA %pM : is adjusting TBTT\n",
sta->sta.addr);
goto no_sync;
@@ -168,15 +167,13 @@ no_sync:
rcu_read_unlock();
}
-static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata,
+static void mesh_sync_offset_adjust_tsf(struct ieee80211_sub_if_data *sdata,
struct beacon_data *beacon)
{
struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
- u8 cap;
WARN_ON(ifmsh->mesh_sp_id != IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET);
WARN_ON(!rcu_read_lock_held());
- cap = beacon->meshconf->meshconf_cap;
spin_lock_bh(&ifmsh->sync_offset_lock);
@@ -187,24 +184,16 @@ static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata,
* the tsf adjustment to the mesh tasklet
*/
msync_dbg(sdata,
- "TBTT : kicking off TBTT adjustment with clockdrift_max=%lld\n",
+ "TSF : kicking off TSF adjustment with clockdrift_max=%lld\n",
ifmsh->sync_offset_clockdrift_max);
set_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags);
-
- ifmsh->adjusting_tbtt = true;
} else {
msync_dbg(sdata,
- "TBTT : max clockdrift=%lld; too small to adjust\n",
+ "TSF : max clockdrift=%lld; too small to adjust\n",
(long long)ifmsh->sync_offset_clockdrift_max);
ifmsh->sync_offset_clockdrift_max = 0;
-
- ifmsh->adjusting_tbtt = false;
}
spin_unlock_bh(&ifmsh->sync_offset_lock);
-
- beacon->meshconf->meshconf_cap = ifmsh->adjusting_tbtt ?
- IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING | cap :
- ~IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING & cap;
}
static const struct sync_method sync_methods[] = {
@@ -212,7 +201,7 @@ static const struct sync_method sync_methods[] = {
.method = IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET,
.ops = {
.rx_bcn_presp = &mesh_sync_offset_rx_bcn_presp,
- .adjust_tbtt = &mesh_sync_offset_adjust_tbtt,
+ .adjust_tsf = &mesh_sync_offset_adjust_tsf,
}
},
};
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 7486f2dab4ba..6e90301154d5 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -30,6 +30,7 @@
#include "driver-ops.h"
#include "rate.h"
#include "led.h"
+#include "fils_aead.h"
#define IEEE80211_AUTH_TIMEOUT (HZ / 5)
#define IEEE80211_AUTH_TIMEOUT_LONG (HZ / 2)
@@ -652,6 +653,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
2 + sizeof(struct ieee80211_ht_cap) + /* HT */
2 + sizeof(struct ieee80211_vht_cap) + /* VHT */
assoc_data->ie_len + /* extra IEs */
+ (assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) +
9, /* WMM */
GFP_KERNEL);
if (!skb)
@@ -875,6 +877,12 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
memcpy(pos, assoc_data->ie + offset, noffset - offset);
}
+ if (assoc_data->fils_kek_len &&
+ fils_encrypt_assoc_req(skb, assoc_data) < 0) {
+ dev_kfree_skb(skb);
+ return;
+ }
+
drv_mgd_prepare_tx(local, sdata);
IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
@@ -1478,10 +1486,6 @@ void ieee80211_recalc_ps(struct ieee80211_local *local)
if (count == 1 && ieee80211_powersave_allowed(found)) {
u8 dtimper = found->u.mgd.dtim_period;
- s32 beaconint_us;
-
- beaconint_us = ieee80211_tu_to_usec(
- found->vif.bss_conf.beacon_int);
timeout = local->dynamic_ps_forced_timeout;
if (timeout < 0)
@@ -2510,7 +2514,7 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata,
}
static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
- bool assoc)
+ bool assoc, bool abandon)
{
struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data;
@@ -2533,6 +2537,9 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
mutex_lock(&sdata->local->mtx);
ieee80211_vif_release_channel(sdata);
mutex_unlock(&sdata->local->mtx);
+
+ if (abandon)
+ cfg80211_abandon_assoc(sdata->dev, assoc_data->bss);
}
kfree(assoc_data);
@@ -2618,6 +2625,9 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
case WLAN_AUTH_LEAP:
case WLAN_AUTH_FT:
case WLAN_AUTH_SAE:
+ case WLAN_AUTH_FILS_SK:
+ case WLAN_AUTH_FILS_SK_PFS:
+ case WLAN_AUTH_FILS_PK:
break;
case WLAN_AUTH_SHARED_KEY:
if (ifmgd->auth_data->expected_transaction != 4) {
@@ -2762,7 +2772,7 @@ static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
bssid, reason_code,
ieee80211_get_reason_code_string(reason_code));
- ieee80211_destroy_assoc_data(sdata, false);
+ ieee80211_destroy_assoc_data(sdata, false, true);
cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len);
return;
@@ -3143,6 +3153,10 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
reassoc ? "Rea" : "A", mgmt->sa,
capab_info, status_code, (u16)(aid & ~(BIT(15) | BIT(14))));
+ if (assoc_data->fils_kek_len &&
+ fils_decrypt_assoc_resp(sdata, (u8 *)mgmt, &len, assoc_data) < 0)
+ return;
+
pos = mgmt->u.assoc_resp.variable;
ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), false, &elems);
@@ -3167,14 +3181,14 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
if (status_code != WLAN_STATUS_SUCCESS) {
sdata_info(sdata, "%pM denied association (code=%d)\n",
mgmt->sa, status_code);
- ieee80211_destroy_assoc_data(sdata, false);
+ ieee80211_destroy_assoc_data(sdata, false, false);
event.u.mlme.status = MLME_DENIED;
event.u.mlme.reason = status_code;
drv_event_callback(sdata->local, sdata, &event);
} else {
if (!ieee80211_assoc_success(sdata, bss, mgmt, len)) {
/* oops -- internal error -- send timeout for now */
- ieee80211_destroy_assoc_data(sdata, false);
+ ieee80211_destroy_assoc_data(sdata, false, false);
cfg80211_assoc_timeout(sdata->dev, bss);
return;
}
@@ -3187,13 +3201,13 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
* recalc after assoc_data is NULL but before associated
* is set can cause the interface to go idle
*/
- ieee80211_destroy_assoc_data(sdata, true);
+ ieee80211_destroy_assoc_data(sdata, true, false);
/* get uapsd queues configuration */
uapsd_queues = 0;
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
if (sdata->tx_conf[ac].uapsd)
- uapsd_queues |= BIT(ac);
+ uapsd_queues |= ieee80211_ac_to_qos_mask[ac];
}
cfg80211_rx_assoc_resp(sdata->dev, bss, (u8 *)mgmt, len, uapsd_queues);
@@ -3405,14 +3419,14 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
ieee80211_cqm_rssi_notify(
&sdata->vif,
NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW,
- GFP_KERNEL);
+ sig, GFP_KERNEL);
} else if (sig > thold &&
(last_event == 0 || sig > last_event + hyst)) {
ifmgd->last_cqm_event_signal = sig;
ieee80211_cqm_rssi_notify(
&sdata->vif,
NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH,
- GFP_KERNEL);
+ sig, GFP_KERNEL);
}
}
@@ -3886,7 +3900,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
.u.mlme.status = MLME_TIMEOUT,
};
- ieee80211_destroy_assoc_data(sdata, false);
+ ieee80211_destroy_assoc_data(sdata, false, false);
cfg80211_assoc_timeout(sdata->dev, bss);
drv_event_callback(sdata->local, sdata, &event);
}
@@ -4025,7 +4039,7 @@ void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata)
WLAN_REASON_DEAUTH_LEAVING,
false, frame_buf);
if (ifmgd->assoc_data)
- ieee80211_destroy_assoc_data(sdata, false);
+ ieee80211_destroy_assoc_data(sdata, false, true);
if (ifmgd->auth_data)
ieee80211_destroy_auth_data(sdata, false);
cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf,
@@ -4479,24 +4493,36 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
case NL80211_AUTHTYPE_SAE:
auth_alg = WLAN_AUTH_SAE;
break;
+ case NL80211_AUTHTYPE_FILS_SK:
+ auth_alg = WLAN_AUTH_FILS_SK;
+ break;
+ case NL80211_AUTHTYPE_FILS_SK_PFS:
+ auth_alg = WLAN_AUTH_FILS_SK_PFS;
+ break;
+ case NL80211_AUTHTYPE_FILS_PK:
+ auth_alg = WLAN_AUTH_FILS_PK;
+ break;
default:
return -EOPNOTSUPP;
}
- auth_data = kzalloc(sizeof(*auth_data) + req->sae_data_len +
+ auth_data = kzalloc(sizeof(*auth_data) + req->auth_data_len +
req->ie_len, GFP_KERNEL);
if (!auth_data)
return -ENOMEM;
auth_data->bss = req->bss;
- if (req->sae_data_len >= 4) {
- __le16 *pos = (__le16 *) req->sae_data;
- auth_data->sae_trans = le16_to_cpu(pos[0]);
- auth_data->sae_status = le16_to_cpu(pos[1]);
- memcpy(auth_data->data, req->sae_data + 4,
- req->sae_data_len - 4);
- auth_data->data_len += req->sae_data_len - 4;
+ if (req->auth_data_len >= 4) {
+ if (req->auth_type == NL80211_AUTHTYPE_SAE) {
+ __le16 *pos = (__le16 *) req->auth_data;
+
+ auth_data->sae_trans = le16_to_cpu(pos[0]);
+ auth_data->sae_status = le16_to_cpu(pos[1]);
+ }
+ memcpy(auth_data->data, req->auth_data + 4,
+ req->auth_data_len - 4);
+ auth_data->data_len += req->auth_data_len - 4;
}
if (req->ie && req->ie_len) {
@@ -4692,6 +4718,21 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
assoc_data->ie_len = req->ie_len;
}
+ if (req->fils_kek) {
+ /* should already be checked in cfg80211 - so warn */
+ if (WARN_ON(req->fils_kek_len > FILS_MAX_KEK_LEN)) {
+ err = -EINVAL;
+ goto err_free;
+ }
+ memcpy(assoc_data->fils_kek, req->fils_kek,
+ req->fils_kek_len);
+ assoc_data->fils_kek_len = req->fils_kek_len;
+ }
+
+ if (req->fils_nonces)
+ memcpy(assoc_data->fils_nonces, req->fils_nonces,
+ 2 * FILS_NONCE_LEN);
+
assoc_data->bss = req->bss;
if (ifmgd->req_smps == IEEE80211_SMPS_AUTOMATIC) {
@@ -4907,7 +4948,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
IEEE80211_STYPE_DEAUTH,
req->reason_code, tx,
frame_buf);
- ieee80211_destroy_assoc_data(sdata, false);
+ ieee80211_destroy_assoc_data(sdata, false, true);
ieee80211_report_disconnect(sdata, frame_buf,
sizeof(frame_buf), true,
req->reason_code);
@@ -4982,7 +5023,7 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata)
sdata_lock(sdata);
if (ifmgd->assoc_data) {
struct cfg80211_bss *bss = ifmgd->assoc_data->bss;
- ieee80211_destroy_assoc_data(sdata, false);
+ ieee80211_destroy_assoc_data(sdata, false, false);
cfg80211_assoc_timeout(sdata->dev, bss);
}
if (ifmgd->auth_data)
@@ -5000,13 +5041,14 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata)
void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif,
enum nl80211_cqm_rssi_threshold_event rssi_event,
+ s32 rssi_level,
gfp_t gfp)
{
struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
- trace_api_cqm_rssi_notify(sdata, rssi_event);
+ trace_api_cqm_rssi_notify(sdata, rssi_event, rssi_level);
- cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, gfp);
+ cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, rssi_level, gfp);
}
EXPORT_SYMBOL(ieee80211_cqm_rssi_notify);
diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c
index 28a3a0957c9e..76a8bcd8ef11 100644
--- a/net/mac80211/pm.c
+++ b/net/mac80211/pm.c
@@ -168,6 +168,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan)
break;
}
+ flush_delayed_work(&sdata->dec_tailroom_needed_wk);
drv_remove_interface(local, sdata);
}
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index 14c5ba3a1b1c..3ebe4405a2d4 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -159,21 +159,23 @@ minstrel_update_rates(struct minstrel_priv *mp, struct minstrel_sta_info *mi)
void
minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs)
{
+ unsigned int cur_prob;
+
if (unlikely(mrs->attempts > 0)) {
mrs->sample_skipped = 0;
- mrs->cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts);
+ cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts);
if (unlikely(!mrs->att_hist)) {
- mrs->prob_ewma = mrs->cur_prob;
+ mrs->prob_ewma = cur_prob;
} else {
/* update exponential weighted moving variance */
- mrs->prob_ewmsd = minstrel_ewmsd(mrs->prob_ewmsd,
- mrs->cur_prob,
- mrs->prob_ewma,
- EWMA_LEVEL);
+ mrs->prob_ewmv = minstrel_ewmv(mrs->prob_ewmv,
+ cur_prob,
+ mrs->prob_ewma,
+ EWMA_LEVEL);
/*update exponential weighted moving avarage */
mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma,
- mrs->cur_prob,
+ cur_prob,
EWMA_LEVEL);
}
mrs->att_hist += mrs->attempts;
@@ -365,6 +367,11 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta,
return;
#endif
+ /* Don't use EAPOL frames for sampling on non-mrr hw */
+ if (mp->hw->max_rates == 1 &&
+ (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO))
+ return;
+
delta = (mi->total_packets * sampling_ratio / 100) -
(mi->sample_packets + mi->sample_deferred / 2);
diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h
index c230bbe93262..be6c3f35f48b 100644
--- a/net/mac80211/rc80211_minstrel.h
+++ b/net/mac80211/rc80211_minstrel.h
@@ -14,7 +14,7 @@
#define SAMPLE_COLUMNS 10 /* number of columns in sample table */
/* scaled fraction values */
-#define MINSTREL_SCALE 16
+#define MINSTREL_SCALE 12
#define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / div)
#define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE)
@@ -36,21 +36,16 @@ minstrel_ewma(int old, int new, int weight)
}
/*
- * Perform EWMSD (Exponentially Weighted Moving Standard Deviation) calculation
+ * Perform EWMV (Exponentially Weighted Moving Variance) calculation
*/
static inline int
-minstrel_ewmsd(int old_ewmsd, int cur_prob, int prob_ewma, int weight)
+minstrel_ewmv(int old_ewmv, int cur_prob, int prob_ewma, int weight)
{
- int diff, incr, tmp_var;
+ int diff, incr;
- /* calculate exponential weighted moving variance */
- diff = MINSTREL_TRUNC((cur_prob - prob_ewma) * 1000000);
+ diff = cur_prob - prob_ewma;
incr = (EWMA_DIV - weight) * diff / EWMA_DIV;
- tmp_var = old_ewmsd * old_ewmsd;
- tmp_var = weight * (tmp_var + diff * incr / 1000000) / EWMA_DIV;
-
- /* return standard deviation */
- return (u16) int_sqrt(tmp_var);
+ return weight * (old_ewmv + MINSTREL_TRUNC(diff * incr)) / EWMA_DIV;
}
struct minstrel_rate_stats {
@@ -59,15 +54,13 @@ struct minstrel_rate_stats {
u16 success, last_success;
/* total attempts/success counters */
- u64 att_hist, succ_hist;
+ u32 att_hist, succ_hist;
/* statistis of packet delivery probability
- * cur_prob - current prob within last update intervall
* prob_ewma - exponential weighted moving average of prob
* prob_ewmsd - exp. weighted moving standard deviation of prob */
- unsigned int cur_prob;
- unsigned int prob_ewma;
- u16 prob_ewmsd;
+ u16 prob_ewma;
+ u16 prob_ewmv;
/* maximum retry counts */
u8 retry_count;
@@ -153,6 +146,14 @@ struct minstrel_debugfs_info {
char buf[];
};
+/* Get EWMSD (Exponentially Weighted Moving Standard Deviation) * 10 */
+static inline int
+minstrel_get_ewmsd10(struct minstrel_rate_stats *mrs)
+{
+ unsigned int ewmv = mrs->prob_ewmv;
+ return int_sqrt(MINSTREL_TRUNC(ewmv * 1000 * 1000));
+}
+
extern const struct rate_control_ops mac80211_minstrel;
void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir);
void minstrel_remove_sta_debugfs(void *priv, void *priv_sta);
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
index 820b0abc9c0d..36fc971deb86 100644
--- a/net/mac80211/rc80211_minstrel_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -75,7 +75,7 @@ minstrel_stats_open(struct inode *inode, struct file *file)
{
struct minstrel_sta_info *mi = inode->i_private;
struct minstrel_debugfs_info *ms;
- unsigned int i, tp_max, tp_avg, prob, eprob;
+ unsigned int i, tp_max, tp_avg, eprob;
char *p;
ms = kmalloc(2048, GFP_KERNEL);
@@ -86,13 +86,14 @@ minstrel_stats_open(struct inode *inode, struct file *file)
p = ms->buf;
p += sprintf(p, "\n");
p += sprintf(p,
- "best __________rate_________ ________statistics________ ________last_______ ______sum-of________\n");
+ "best __________rate_________ ________statistics________ ____last_____ ______sum-of________\n");
p += sprintf(p,
- "rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [prob.|retry|suc|att] [#success | #attempts]\n");
+ "rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [retry|suc|att] [#success | #attempts]\n");
for (i = 0; i < mi->n_rates; i++) {
struct minstrel_rate *mr = &mi->r[i];
struct minstrel_rate_stats *mrs = &mi->r[i].stats;
+ unsigned int prob_ewmsd;
*(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' ';
*(p++) = (i == mi->max_tp_rate[1]) ? 'B' : ' ';
@@ -107,17 +108,16 @@ minstrel_stats_open(struct inode *inode, struct file *file)
tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
- prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+ prob_ewmsd = minstrel_get_ewmsd10(mrs);
p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u"
- " %3u.%1u %3u %3u %-3u "
+ " %3u %3u %-3u "
"%9llu %-9llu\n",
tp_max / 10, tp_max % 10,
tp_avg / 10, tp_avg % 10,
eprob / 10, eprob % 10,
- mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
- prob / 10, prob % 10,
+ prob_ewmsd / 10, prob_ewmsd % 10,
mrs->retry_count,
mrs->last_success,
mrs->last_attempts,
@@ -148,7 +148,7 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
{
struct minstrel_sta_info *mi = inode->i_private;
struct minstrel_debugfs_info *ms;
- unsigned int i, tp_max, tp_avg, prob, eprob;
+ unsigned int i, tp_max, tp_avg, eprob;
char *p;
ms = kmalloc(2048, GFP_KERNEL);
@@ -161,6 +161,7 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
for (i = 0; i < mi->n_rates; i++) {
struct minstrel_rate *mr = &mi->r[i];
struct minstrel_rate_stats *mrs = &mi->r[i].stats;
+ unsigned int prob_ewmsd;
p += sprintf(p, "%s" ,((i == mi->max_tp_rate[0]) ? "A" : ""));
p += sprintf(p, "%s" ,((i == mi->max_tp_rate[1]) ? "B" : ""));
@@ -175,16 +176,15 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
- prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+ prob_ewmsd = minstrel_get_ewmsd10(mrs);
- p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u,"
+ p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u,"
"%llu,%llu,%d,%d\n",
tp_max / 10, tp_max % 10,
tp_avg / 10, tp_avg % 10,
eprob / 10, eprob % 10,
- mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
- prob / 10, prob % 10,
+ prob_ewmsd / 10, prob_ewmsd % 10,
mrs->retry_count,
mrs->last_success,
mrs->last_attempts,
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 30fbabf4bcbc..8e783e197e93 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -14,6 +14,7 @@
#include <linux/ieee80211.h>
#include <net/mac80211.h>
#include "rate.h"
+#include "sta_info.h"
#include "rc80211_minstrel.h"
#include "rc80211_minstrel_ht.h"
@@ -154,67 +155,47 @@ MODULE_PARM_DESC(minstrel_vht_only,
const struct mcs_group minstrel_mcs_groups[] = {
MCS_GROUP(1, 0, BW_20),
MCS_GROUP(2, 0, BW_20),
-#if MINSTREL_MAX_STREAMS >= 3
MCS_GROUP(3, 0, BW_20),
-#endif
MCS_GROUP(1, 1, BW_20),
MCS_GROUP(2, 1, BW_20),
-#if MINSTREL_MAX_STREAMS >= 3
MCS_GROUP(3, 1, BW_20),
-#endif
MCS_GROUP(1, 0, BW_40),
MCS_GROUP(2, 0, BW_40),
-#if MINSTREL_MAX_STREAMS >= 3
MCS_GROUP(3, 0, BW_40),
-#endif
MCS_GROUP(1, 1, BW_40),
MCS_GROUP(2, 1, BW_40),
-#if MINSTREL_MAX_STREAMS >= 3
MCS_GROUP(3, 1, BW_40),
-#endif
CCK_GROUP,
#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
VHT_GROUP(1, 0, BW_20),
VHT_GROUP(2, 0, BW_20),
-#if MINSTREL_MAX_STREAMS >= 3
VHT_GROUP(3, 0, BW_20),
-#endif
VHT_GROUP(1, 1, BW_20),
VHT_GROUP(2, 1, BW_20),
-#if MINSTREL_MAX_STREAMS >= 3
VHT_GROUP(3, 1, BW_20),
-#endif
VHT_GROUP(1, 0, BW_40),
VHT_GROUP(2, 0, BW_40),
-#if MINSTREL_MAX_STREAMS >= 3
VHT_GROUP(3, 0, BW_40),
-#endif
VHT_GROUP(1, 1, BW_40),
VHT_GROUP(2, 1, BW_40),
-#if MINSTREL_MAX_STREAMS >= 3
VHT_GROUP(3, 1, BW_40),
-#endif
VHT_GROUP(1, 0, BW_80),
VHT_GROUP(2, 0, BW_80),
-#if MINSTREL_MAX_STREAMS >= 3
VHT_GROUP(3, 0, BW_80),
-#endif
VHT_GROUP(1, 1, BW_80),
VHT_GROUP(2, 1, BW_80),
-#if MINSTREL_MAX_STREAMS >= 3
VHT_GROUP(3, 1, BW_80),
#endif
-#endif
};
static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly;
@@ -301,7 +282,7 @@ minstrel_ht_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
break;
/* short preamble */
- if (!(mi->groups[group].supported & BIT(idx)))
+ if (!(mi->supported[group] & BIT(idx)))
idx += 4;
}
return &mi->groups[group].rates[idx];
@@ -486,7 +467,7 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi)
MCS_GROUP_RATES].streams;
for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) {
mg = &mi->groups[group];
- if (!mg->supported || group == MINSTREL_CCK_GROUP)
+ if (!mi->supported[group] || group == MINSTREL_CCK_GROUP)
continue;
tmp_idx = mg->max_group_prob_rate % MCS_GROUP_RATES;
@@ -540,7 +521,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) {
mg = &mi->groups[group];
- if (!mg->supported)
+ if (!mi->supported[group])
continue;
mi->sample_count++;
@@ -550,7 +531,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
tmp_group_tp_rate[j] = group;
for (i = 0; i < MCS_GROUP_RATES; i++) {
- if (!(mg->supported & BIT(i)))
+ if (!(mi->supported[group] & BIT(i)))
continue;
index = MCS_GROUP_RATES * group + i;
@@ -636,7 +617,7 @@ minstrel_set_next_sample_idx(struct minstrel_ht_sta *mi)
mi->sample_group %= ARRAY_SIZE(minstrel_mcs_groups);
mg = &mi->groups[mi->sample_group];
- if (!mg->supported)
+ if (!mi->supported[mi->sample_group])
continue;
if (++mg->index >= MCS_GROUP_RATES) {
@@ -657,7 +638,7 @@ minstrel_downgrade_rate(struct minstrel_ht_sta *mi, u16 *idx, bool primary)
while (group > 0) {
group--;
- if (!mi->groups[group].supported)
+ if (!mi->supported[group])
continue;
if (minstrel_mcs_groups[group].streams >
@@ -994,7 +975,7 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
sample_idx = sample_table[mg->column][mg->index];
minstrel_set_next_sample_idx(mi);
- if (!(mg->supported & BIT(sample_idx)))
+ if (!(mi->supported[sample_group] & BIT(sample_idx)))
return -1;
mrs = &mg->rates[sample_idx];
@@ -1049,22 +1030,6 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
}
static void
-minstrel_ht_check_cck_shortpreamble(struct minstrel_priv *mp,
- struct minstrel_ht_sta *mi, bool val)
-{
- u8 supported = mi->groups[MINSTREL_CCK_GROUP].supported;
-
- if (!supported || !mi->cck_supported_short)
- return;
-
- if (supported & (mi->cck_supported_short << (val * 4)))
- return;
-
- supported ^= mi->cck_supported_short | (mi->cck_supported_short << 4);
- mi->groups[MINSTREL_CCK_GROUP].supported = supported;
-}
-
-static void
minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
struct ieee80211_tx_rate_control *txrc)
{
@@ -1087,7 +1052,6 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
minstrel_aggr_check(sta, txrc->skb);
info->flags |= mi->tx_flags;
- minstrel_ht_check_cck_shortpreamble(mp, mi, txrc->short_preamble);
#ifdef CONFIG_MAC80211_DEBUGFS
if (mp->fixed_rate_idx != -1)
@@ -1154,7 +1118,7 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
mi->cck_supported_short |= BIT(i);
}
- mi->groups[MINSTREL_CCK_GROUP].supported = mi->cck_supported;
+ mi->supported[MINSTREL_CCK_GROUP] = mi->cck_supported;
}
static void
@@ -1168,6 +1132,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs;
u16 sta_cap = sta->ht_cap.cap;
struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap;
+ struct sta_info *sinfo = container_of(sta, struct sta_info, sta);
int use_vht;
int n_supported = 0;
int ack_dur;
@@ -1224,7 +1189,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
u32 gflags = minstrel_mcs_groups[i].flags;
int bw, nss;
- mi->groups[i].supported = 0;
+ mi->supported[i] = 0;
if (i == MINSTREL_CCK_GROUP) {
minstrel_ht_update_cck(mp, mi, sband, sta);
continue;
@@ -1256,8 +1221,8 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
if (use_vht && minstrel_vht_only)
continue;
#endif
- mi->groups[i].supported = mcs->rx_mask[nss - 1];
- if (mi->groups[i].supported)
+ mi->supported[i] = mcs->rx_mask[nss - 1];
+ if (mi->supported[i])
n_supported++;
continue;
}
@@ -1283,16 +1248,19 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
else
bw = BW_20;
- mi->groups[i].supported = minstrel_get_valid_vht_rates(bw, nss,
+ mi->supported[i] = minstrel_get_valid_vht_rates(bw, nss,
vht_cap->vht_mcs.tx_mcs_map);
- if (mi->groups[i].supported)
+ if (mi->supported[i])
n_supported++;
}
if (!n_supported)
goto use_legacy;
+ if (test_sta_flag(sinfo, WLAN_STA_SHORT_PREAMBLE))
+ mi->cck_supported_short |= mi->cck_supported_short << 4;
+
/* create an initial rate table with the lowest supported rates */
minstrel_ht_update_stats(mp, mi);
minstrel_ht_update_rates(mp, mi);
diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h
index e8b52a94d24b..de1646c42e82 100644
--- a/net/mac80211/rc80211_minstrel_ht.h
+++ b/net/mac80211/rc80211_minstrel_ht.h
@@ -52,9 +52,6 @@ struct minstrel_mcs_group_data {
u8 index;
u8 column;
- /* bitfield of supported MCS rates of this group */
- u16 supported;
-
/* sorted rate set within a MCS group*/
u16 max_group_tp_rate[MAX_THR_RATES];
u16 max_group_prob_rate;
@@ -101,6 +98,9 @@ struct minstrel_ht_sta {
u8 cck_supported;
u8 cck_supported_short;
+ /* Bitfield of supported MCS rates of all groups */
+ u16 supported[MINSTREL_GROUPS_NB];
+
/* MCS rate group info and statistics */
struct minstrel_mcs_group_data groups[MINSTREL_GROUPS_NB];
};
diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c
index 5320e35ed3d0..7d969e300fb3 100644
--- a/net/mac80211/rc80211_minstrel_ht_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -19,12 +19,12 @@ static char *
minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
{
const struct mcs_group *mg;
- unsigned int j, tp_max, tp_avg, prob, eprob, tx_time;
+ unsigned int j, tp_max, tp_avg, eprob, tx_time;
char htmode = '2';
char gimode = 'L';
u32 gflags;
- if (!mi->groups[i].supported)
+ if (!mi->supported[i])
return p;
mg = &minstrel_mcs_groups[i];
@@ -41,8 +41,9 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j];
static const int bitrates[4] = { 10, 20, 55, 110 };
int idx = i * MCS_GROUP_RATES + j;
+ unsigned int prob_ewmsd;
- if (!(mi->groups[i].supported & BIT(j)))
+ if (!(mi->supported[i] & BIT(j)))
continue;
if (gflags & IEEE80211_TX_RC_MCS) {
@@ -83,17 +84,16 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
- prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+ prob_ewmsd = minstrel_get_ewmsd10(mrs);
p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u"
- " %3u.%1u %3u %3u %-3u "
+ " %3u %3u %-3u "
"%9llu %-9llu\n",
tp_max / 10, tp_max % 10,
tp_avg / 10, tp_avg % 10,
eprob / 10, eprob % 10,
- mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
- prob / 10, prob % 10,
+ prob_ewmsd / 10, prob_ewmsd % 10,
mrs->retry_count,
mrs->last_success,
mrs->last_attempts,
@@ -130,9 +130,9 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file)
p += sprintf(p, "\n");
p += sprintf(p,
- " best ____________rate__________ ________statistics________ ________last_______ ______sum-of________\n");
+ " best ____________rate__________ ________statistics________ _____last____ ______sum-of________\n");
p += sprintf(p,
- "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [prob.|retry|suc|att] [#success | #attempts]\n");
+ "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [retry|suc|att] [#success | #attempts]\n");
p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p);
for (i = 0; i < MINSTREL_CCK_GROUP; i++)
@@ -165,12 +165,12 @@ static char *
minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
{
const struct mcs_group *mg;
- unsigned int j, tp_max, tp_avg, prob, eprob, tx_time;
+ unsigned int j, tp_max, tp_avg, eprob, tx_time;
char htmode = '2';
char gimode = 'L';
u32 gflags;
- if (!mi->groups[i].supported)
+ if (!mi->supported[i])
return p;
mg = &minstrel_mcs_groups[i];
@@ -187,8 +187,9 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j];
static const int bitrates[4] = { 10, 20, 55, 110 };
int idx = i * MCS_GROUP_RATES + j;
+ unsigned int prob_ewmsd;
- if (!(mi->groups[i].supported & BIT(j)))
+ if (!(mi->supported[i] & BIT(j)))
continue;
if (gflags & IEEE80211_TX_RC_MCS) {
@@ -226,16 +227,15 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
- prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+ prob_ewmsd = minstrel_get_ewmsd10(mrs);
- p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,"
+ p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,"
"%u,%llu,%llu,",
tp_max / 10, tp_max % 10,
tp_avg / 10, tp_avg % 10,
eprob / 10, eprob % 10,
- mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
- prob / 10, prob % 10,
+ prob_ewmsd / 10, prob_ewmsd % 10,
mrs->retry_count,
mrs->last_success,
mrs->last_attempts,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index a47bbc973f2d..4d7543d1a62c 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -4,7 +4,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
- * Copyright(c) 2015 - 2016 Intel Deutschland GmbH
+ * Copyright(c) 2015 - 2017 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -208,6 +208,51 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
return len;
}
+static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb,
+ int rtap_vendor_space)
+{
+ struct {
+ struct ieee80211_hdr_3addr hdr;
+ u8 category;
+ u8 action_code;
+ } __packed action;
+
+ if (!sdata)
+ return;
+
+ BUILD_BUG_ON(sizeof(action) != IEEE80211_MIN_ACTION_SIZE + 1);
+
+ if (skb->len < rtap_vendor_space + sizeof(action) +
+ VHT_MUMIMO_GROUPS_DATA_LEN)
+ return;
+
+ if (!is_valid_ether_addr(sdata->u.mntr.mu_follow_addr))
+ return;
+
+ skb_copy_bits(skb, rtap_vendor_space, &action, sizeof(action));
+
+ if (!ieee80211_is_action(action.hdr.frame_control))
+ return;
+
+ if (action.category != WLAN_CATEGORY_VHT)
+ return;
+
+ if (action.action_code != WLAN_VHT_ACTION_GROUPID_MGMT)
+ return;
+
+ if (!ether_addr_equal(action.hdr.addr1, sdata->u.mntr.mu_follow_addr))
+ return;
+
+ skb = skb_copy(skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
+ skb_queue_tail(&sdata->skb_queue, skb);
+ ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+}
+
/*
* ieee80211_add_rx_radiotap_header - add radiotap header
*
@@ -515,7 +560,6 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
struct net_device *prev_dev = NULL;
int present_fcs_len = 0;
unsigned int rtap_vendor_space = 0;
- struct ieee80211_mgmt *mgmt;
struct ieee80211_sub_if_data *monitor_sdata =
rcu_dereference(local->monitor_sdata);
@@ -553,6 +597,8 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
return remove_monitor_info(local, origskb, rtap_vendor_space);
}
+ ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_vendor_space);
+
/* room for the radiotap header based on driver features */
rt_hdrlen = ieee80211_rx_radiotap_hdrlen(local, status, origskb);
needed_headroom = rt_hdrlen - rtap_vendor_space;
@@ -618,23 +664,6 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
ieee80211_rx_stats(sdata->dev, skb->len);
}
- mgmt = (void *)skb->data;
- if (monitor_sdata &&
- skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 + VHT_MUMIMO_GROUPS_DATA_LEN &&
- ieee80211_is_action(mgmt->frame_control) &&
- mgmt->u.action.category == WLAN_CATEGORY_VHT &&
- mgmt->u.action.u.vht_group_notif.action_code == WLAN_VHT_ACTION_GROUPID_MGMT &&
- is_valid_ether_addr(monitor_sdata->u.mntr.mu_follow_addr) &&
- ether_addr_equal(mgmt->da, monitor_sdata->u.mntr.mu_follow_addr)) {
- struct sk_buff *mu_skb = skb_copy(skb, GFP_ATOMIC);
-
- if (mu_skb) {
- mu_skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
- skb_queue_tail(&monitor_sdata->skb_queue, mu_skb);
- ieee80211_queue_work(&local->hw, &monitor_sdata->work);
- }
- }
-
if (prev_dev) {
skb->dev = prev_dev;
netif_receive_skb(skb);
@@ -1034,6 +1063,18 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata
buf_size = tid_agg_rx->buf_size;
head_seq_num = tid_agg_rx->head_seq_num;
+ /*
+ * If the current MPDU's SN is smaller than the SSN, it shouldn't
+ * be reordered.
+ */
+ if (unlikely(!tid_agg_rx->started)) {
+ if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) {
+ ret = false;
+ goto out;
+ }
+ tid_agg_rx->started = true;
+ }
+
/* frame with out of date sequence number */
if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) {
dev_kfree_skb(skb);
@@ -1391,16 +1432,18 @@ EXPORT_SYMBOL(ieee80211_sta_pspoll);
void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *pubsta, u8 tid)
{
struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
- u8 ac = ieee802_1d_to_ac[tid & 7];
+ int ac = ieee80211_ac_from_tid(tid);
/*
- * If this AC is not trigger-enabled do nothing.
+ * If this AC is not trigger-enabled do nothing unless the
+ * driver is calling us after it already checked.
*
* NB: This could/should check a separate bitmap of trigger-
* enabled queues, but for now we only implement uAPSD w/o
* TSPEC changes to the ACs, so they're always the same.
*/
- if (!(sta->sta.uapsd_queues & BIT(ac)))
+ if (!(sta->sta.uapsd_queues & ieee80211_ac_to_qos_mask[ac]) &&
+ tid != IEEE80211_NUM_TIDS)
return;
/* if we are in a service period, do nothing */
@@ -1906,7 +1949,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
unsigned int frag, seq;
struct ieee80211_fragment_entry *entry;
struct sk_buff *skb;
- struct ieee80211_rx_status *status;
hdr = (struct ieee80211_hdr *)rx->skb->data;
fc = hdr->frame_control;
@@ -2032,9 +2074,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
dev_kfree_skb(skb);
}
- /* Complete frame has been reassembled - process it now */
- status = IEEE80211_SKB_RXCB(rx->skb);
-
out:
ieee80211_led_rx(rx->local);
out_no_led:
@@ -2215,7 +2254,8 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
!(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
(sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) {
- if (is_multicast_ether_addr(ehdr->h_dest)) {
+ if (is_multicast_ether_addr(ehdr->h_dest) &&
+ ieee80211_vif_get_num_mcast_if(sdata) != 0) {
/*
* send multicast frames both to higher layers in
* local net stack and back to the wireless medium
@@ -2224,7 +2264,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
if (!xmit_skb)
net_info_ratelimited("%s: failed to clone multicast frame\n",
dev->name);
- } else {
+ } else if (!is_multicast_ether_addr(ehdr->h_dest)) {
dsta = sta_info_get(sdata, skb->data);
if (dsta) {
/*
@@ -2469,7 +2509,8 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
if (!ifmsh->mshcfg.dot11MeshForwarding)
goto out;
- fwd_skb = skb_copy(skb, GFP_ATOMIC);
+ fwd_skb = skb_copy_expand(skb, local->tx_headroom +
+ sdata->encrypt_headroom, 0, GFP_ATOMIC);
if (!fwd_skb) {
net_info_ratelimited("%s: failed to clone mesh frame\n",
sdata->name);
@@ -2877,17 +2918,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
switch (mgmt->u.action.u.vht_opmode_notif.action_code) {
case WLAN_VHT_ACTION_OPMODE_NOTIF: {
- u8 opmode;
-
/* verify opmode is present */
if (len < IEEE80211_MIN_ACTION_SIZE + 2)
goto invalid;
-
- opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode;
-
- ieee80211_vht_handle_opmode(rx->sdata, rx->sta,
- opmode, status->band);
- goto handled;
+ goto queue;
}
case WLAN_VHT_ACTION_GROUPID_MGMT: {
if (len < IEEE80211_MIN_ACTION_SIZE + 25)
@@ -3605,6 +3639,27 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
!ether_addr_equal(bssid, hdr->addr1))
return false;
}
+
+ /*
+ * 802.11-2016 Table 9-26 says that for data frames, A1 must be
+ * the BSSID - we've checked that already but may have accepted
+ * the wildcard (ff:ff:ff:ff:ff:ff).
+ *
+ * It also says:
+ * The BSSID of the Data frame is determined as follows:
+ * a) If the STA is contained within an AP or is associated
+ * with an AP, the BSSID is the address currently in use
+ * by the STA contained in the AP.
+ *
+ * So we should not accept data frames with an address that's
+ * multicast.
+ *
+ * Accepting it also opens a security problem because stations
+ * could encrypt it with the GTK and inject traffic that way.
+ */
+ if (ieee80211_is_data(hdr->frame_control) && multicast)
+ return false;
+
return true;
case NL80211_IFTYPE_WDS:
if (bssid || !ieee80211_is_data(hdr->frame_control))
@@ -3887,6 +3942,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
stats->last_rate = sta_stats_encode_rate(status);
stats->fragments++;
+ stats->packets++;
if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
stats->last_signal = status->signal;
@@ -3939,21 +3995,31 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
u64_stats_update_end(&stats->syncp);
if (fast_rx->internal_forward) {
- struct sta_info *dsta = sta_info_get(rx->sdata, skb->data);
+ struct sk_buff *xmit_skb = NULL;
+ bool multicast = is_multicast_ether_addr(skb->data);
- if (dsta) {
+ if (multicast) {
+ xmit_skb = skb_copy(skb, GFP_ATOMIC);
+ } else if (sta_info_get(rx->sdata, skb->data)) {
+ xmit_skb = skb;
+ skb = NULL;
+ }
+
+ if (xmit_skb) {
/*
* Send to wireless media and increase priority by 256
* to keep the received priority instead of
* reclassifying the frame (see cfg80211_classify8021d).
*/
- skb->priority += 256;
- skb->protocol = htons(ETH_P_802_3);
- skb_reset_network_header(skb);
- skb_reset_mac_header(skb);
- dev_queue_xmit(skb);
- return true;
+ xmit_skb->priority += 256;
+ xmit_skb->protocol = htons(ETH_P_802_3);
+ skb_reset_network_header(xmit_skb);
+ skb_reset_mac_header(xmit_skb);
+ dev_queue_xmit(xmit_skb);
}
+
+ if (!skb)
+ return true;
}
/* deliver to local stack */
@@ -4070,15 +4136,17 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
ieee80211_is_beacon(hdr->frame_control)))
ieee80211_scan_rx(local, skb);
- if (pubsta) {
- rx.sta = container_of(pubsta, struct sta_info, sta);
- rx.sdata = rx.sta->sdata;
- if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
- return;
- goto out;
- } else if (ieee80211_is_data(fc)) {
+ if (ieee80211_is_data(fc)) {
struct sta_info *sta, *prev_sta;
+ if (pubsta) {
+ rx.sta = container_of(pubsta, struct sta_info, sta);
+ rx.sdata = rx.sta->sdata;
+ if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
+ return;
+ goto out;
+ }
+
prev_sta = NULL;
for_each_sta_info(local, hdr->addr2, sta, tmp) {
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 23d8ac829279..faab3c490d2b 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -1120,7 +1120,6 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
u32 rate_masks[NUM_NL80211_BANDS] = {};
u8 bands_used = 0;
u8 *ie;
- size_t len;
iebufsz = local->scan_ies_len + req->ie_len;
@@ -1145,10 +1144,9 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
ieee80211_prepare_scan_chandef(&chandef, req->scan_width);
- len = ieee80211_build_preq_ies(local, ie, num_bands * iebufsz,
- &sched_scan_ies, req->ie,
- req->ie_len, bands_used,
- rate_masks, &chandef);
+ ieee80211_build_preq_ies(local, ie, num_bands * iebufsz,
+ &sched_scan_ies, req->ie,
+ req->ie_len, bands_used, rate_masks, &chandef);
ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies);
if (ret == 0) {
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 8e05032689f0..3323a2fb289b 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -513,23 +513,23 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
{
struct ieee80211_local *local = sta->local;
struct ieee80211_sub_if_data *sdata = sta->sdata;
- struct station_info *sinfo;
+ struct station_info *sinfo = NULL;
int err = 0;
lockdep_assert_held(&local->sta_mtx);
- sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL);
- if (!sinfo) {
- err = -ENOMEM;
- goto out_err;
- }
-
/* check if STA exists already */
if (sta_info_get_bss(sdata, sta->sta.addr)) {
err = -EEXIST;
goto out_err;
}
+ sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL);
+ if (!sinfo) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
local->num_sta++;
local->sta_generation++;
smp_mb();
@@ -688,7 +688,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending)
}
/* No need to do anything if the driver does all */
- if (ieee80211_hw_check(&local->hw, AP_LINK_PS))
+ if (ieee80211_hw_check(&local->hw, AP_LINK_PS) && !local->ops->set_tim)
return;
if (sta->dead)
@@ -709,7 +709,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending)
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
unsigned long tids;
- if (ignore_for_tim & BIT(ac))
+ if (ignore_for_tim & ieee80211_ac_to_qos_mask[ac])
continue;
indicate_tim |= !skb_queue_empty(&sta->tx_filtered[ac]) ||
@@ -1264,7 +1264,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
sta_info_recalc_tim(sta);
ps_dbg(sdata,
- "STA %pM aid %d sending %d filtered/%d PS frames since STA not sleeping anymore\n",
+ "STA %pM aid %d sending %d filtered/%d PS frames since STA woke up\n",
sta->sta.addr, sta->sta.aid, filtered, buffered);
ieee80211_check_fast_xmit(sta);
@@ -1389,7 +1389,7 @@ ieee80211_sta_ps_more_data(struct sta_info *sta, u8 ignored_acs,
return true;
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
- if (ignored_acs & BIT(ac))
+ if (ignored_acs & ieee80211_ac_to_qos_mask[ac])
continue;
if (!skb_queue_empty(&sta->tx_filtered[ac]) ||
@@ -1414,7 +1414,7 @@ ieee80211_sta_ps_get_frames(struct sta_info *sta, int n_frames, u8 ignored_acs,
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
unsigned long tids;
- if (ignored_acs & BIT(ac))
+ if (ignored_acs & ieee80211_ac_to_qos_mask[ac])
continue;
tids = ieee80211_tids_for_ac(ac);
@@ -1482,7 +1482,7 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
BIT(find_highest_prio_tid(driver_release_tids));
if (skb_queue_empty(&frames) && !driver_release_tids) {
- int tid;
+ int tid, ac;
/*
* For PS-Poll, this can only happen due to a race condition
@@ -1500,7 +1500,10 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
*/
/* This will evaluate to 1, 3, 5 or 7. */
- tid = 7 - ((ffs(~ignored_acs) - 1) << 1);
+ for (ac = IEEE80211_AC_VO; ac < IEEE80211_NUM_ACS; ac++)
+ if (!(ignored_acs & ieee80211_ac_to_qos_mask[ac]))
+ break;
+ tid = 7 - 2 * ac;
ieee80211_send_null_response(sta, tid, reason, true, false);
} else if (!driver_release_tids) {
@@ -1871,10 +1874,7 @@ int sta_info_move_state(struct sta_info *sta,
if (!sta->sta.support_p2p_ps)
ieee80211_recalc_p2p_go_ps_allowed(sta->sdata);
} else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) {
- if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
- (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
- !sta->sdata->u.vlan.sta))
- atomic_dec(&sta->sdata->bss->num_mcast_sta);
+ ieee80211_vif_dec_num_mcast(sta->sdata);
clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
ieee80211_clear_fast_xmit(sta);
ieee80211_clear_fast_rx(sta);
@@ -1882,10 +1882,7 @@ int sta_info_move_state(struct sta_info *sta,
break;
case IEEE80211_STA_AUTHORIZED:
if (sta->sta_state == IEEE80211_STA_ASSOC) {
- if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
- (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
- !sta->sdata->u.vlan.sta))
- atomic_inc(&sta->sdata->bss->num_mcast_sta);
+ ieee80211_vif_inc_num_mcast(sta->sdata);
set_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
ieee80211_check_fast_xmit(sta);
ieee80211_check_fast_rx(sta);
@@ -1975,6 +1972,7 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate,
u16 brate;
unsigned int shift;
+ rinfo->flags = 0;
sband = local->hw.wiphy->bands[(rate >> 4) & 0xf];
brate = sband->bitrates[rate & 0xf].bitrate;
if (rinfo->bw == RATE_INFO_BW_5)
@@ -1990,14 +1988,15 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate,
rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI;
}
-static void sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo)
+static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo)
{
u16 rate = ACCESS_ONCE(sta_get_last_rx_stats(sta)->last_rate);
if (rate == STA_STATS_RATE_INVALID)
- rinfo->flags = 0;
- else
- sta_stats_decode_rate(sta->local, rate, rinfo);
+ return -EINVAL;
+
+ sta_stats_decode_rate(sta->local, rate, rinfo);
+ return 0;
}
static void sta_set_tidstats(struct sta_info *sta,
@@ -2052,16 +2051,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
{
struct ieee80211_sub_if_data *sdata = sta->sdata;
struct ieee80211_local *local = sdata->local;
- struct rate_control_ref *ref = NULL;
u32 thr = 0;
int i, ac, cpu;
struct ieee80211_sta_rx_stats *last_rxstats;
last_rxstats = sta_get_last_rx_stats(sta);
- if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
- ref = local->rate_ctrl;
-
sinfo->generation = sdata->local->sta_generation;
/* do before driver, so beacon filtering drivers have a
@@ -2202,8 +2197,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
}
if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE))) {
- sta_set_rate_info_rx(sta, &sinfo->rxrate);
- sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE);
+ if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0)
+ sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE);
}
sinfo->filled |= BIT(NL80211_STA_INFO_TID_STATS);
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index ed5fcb984a01..e65cda34d2bc 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -184,12 +184,12 @@ struct tid_ampdu_tx {
* @ssn: Starting Sequence Number expected to be aggregated.
* @buf_size: buffer size for incoming A-MPDUs
* @timeout: reset timer value (in TUs).
- * @dialog_token: dialog token for aggregation session
* @rcu_head: RCU head used for freeing this struct
* @reorder_lock: serializes access to reorder buffer, see below.
* @auto_seq: used for offloaded BA sessions to automatically pick head_seq_and
* and ssn.
* @removed: this session is removed (but might have been found due to RCU)
+ * @started: this session has started (head ssn or higher was received)
*
* This structure's lifetime is managed by RCU, assignments to
* the array holding it must hold the aggregation mutex.
@@ -213,9 +213,9 @@ struct tid_ampdu_rx {
u16 ssn;
u16 buf_size;
u16 timeout;
- u8 dialog_token;
- bool auto_seq;
- bool removed;
+ u8 auto_seq:1,
+ removed:1,
+ started:1;
};
/**
@@ -225,6 +225,7 @@ struct tid_ampdu_rx {
* to tid_tx[idx], which are protected by the sta spinlock)
* tid_start_tx is also protected by sta->lock.
* @tid_rx: aggregation info for Rx per TID -- RCU protected
+ * @tid_rx_token: dialog tokens for valid aggregation sessions
* @tid_rx_timer_expired: bitmap indicating on which TIDs the
* RX timer expired until the work for it runs
* @tid_rx_stop_requested: bitmap indicating which BA sessions per TID the
@@ -243,6 +244,7 @@ struct sta_ampdu_mlme {
struct mutex mtx;
/* rx */
struct tid_ampdu_rx __rcu *tid_rx[IEEE80211_NUM_TIDS];
+ u8 tid_rx_token[IEEE80211_NUM_TIDS];
unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
@@ -370,7 +372,7 @@ struct mesh_sta {
unsigned int fail_avg;
};
-DECLARE_EWMA(signal, 1024, 8)
+DECLARE_EWMA(signal, 10, 8)
struct ieee80211_sta_rx_stats {
unsigned long packets;
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index ddf71c648cab..83b8b11f24ea 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -51,7 +51,8 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
struct ieee80211_hdr *hdr = (void *)skb->data;
int ac;
- if (info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER) {
+ if (info->flags & (IEEE80211_TX_CTL_NO_PS_BUFFER |
+ IEEE80211_TX_CTL_AMPDU)) {
ieee80211_free_txskb(&local->hw, skb);
return;
}
@@ -95,7 +96,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
*/
if (*p & IEEE80211_QOS_CTL_EOSP)
*p &= ~IEEE80211_QOS_CTL_EOSP;
- ac = ieee802_1d_to_ac[tid & 7];
+ ac = ieee80211_ac_from_tid(tid);
} else {
ac = IEEE80211_AC_BE;
}
@@ -462,9 +463,7 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local,
unsigned long flags;
spin_lock_irqsave(&local->ack_status_lock, flags);
- skb = idr_find(&local->ack_status_frames, info->ack_frame_id);
- if (skb)
- idr_remove(&local->ack_status_frames, info->ack_frame_id);
+ skb = idr_remove(&local->ack_status_frames, info->ack_frame_id);
spin_unlock_irqrestore(&local->ack_status_lock, flags);
if (!skb)
@@ -541,6 +540,11 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
} else if (info->ack_frame_id) {
ieee80211_report_ack_skb(local, info, acked, dropped);
}
+
+ if (!dropped && skb->destructor) {
+ skb->wifi_acked_valid = 1;
+ skb->wifi_acked = acked;
+ }
}
/*
@@ -633,10 +637,9 @@ void ieee80211_tx_status_noskb(struct ieee80211_hw *hw,
struct ieee80211_local *local = hw_to_local(hw);
struct ieee80211_supported_band *sband;
int retry_count;
- int rates_idx;
bool acked, noack_success;
- rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
+ ieee80211_tx_get_rates(hw, info, &retry_count);
sband = hw->wiphy->bands[info->band];
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 92a47afaa989..0d645bc148d0 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1736,21 +1736,21 @@ TRACE_EVENT(drv_start_nan,
LOCAL_ENTRY
VIF_ENTRY
__field(u8, master_pref)
- __field(u8, dual)
+ __field(u8, bands)
),
TP_fast_assign(
LOCAL_ASSIGN;
VIF_ASSIGN;
__entry->master_pref = conf->master_pref;
- __entry->dual = conf->dual;
+ __entry->bands = conf->bands;
),
TP_printk(
LOCAL_PR_FMT VIF_PR_FMT
- ", master preference: %u, dual: %d",
+ ", master preference: %u, bands: 0x%0x",
LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
- __entry->dual
+ __entry->bands
)
);
@@ -1787,7 +1787,7 @@ TRACE_EVENT(drv_nan_change_conf,
LOCAL_ENTRY
VIF_ENTRY
__field(u8, master_pref)
- __field(u8, dual)
+ __field(u8, bands)
__field(u32, changes)
),
@@ -1795,15 +1795,15 @@ TRACE_EVENT(drv_nan_change_conf,
LOCAL_ASSIGN;
VIF_ASSIGN;
__entry->master_pref = conf->master_pref;
- __entry->dual = conf->dual;
+ __entry->bands = conf->bands;
__entry->changes = changes;
),
TP_printk(
LOCAL_PR_FMT VIF_PR_FMT
- ", master preference: %u, dual: %d, changes: 0x%x",
+ ", master preference: %u, bands: 0x%0x, changes: 0x%x",
LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
- __entry->dual, __entry->changes
+ __entry->bands, __entry->changes
)
);
@@ -1996,23 +1996,26 @@ TRACE_EVENT(api_connection_loss,
TRACE_EVENT(api_cqm_rssi_notify,
TP_PROTO(struct ieee80211_sub_if_data *sdata,
- enum nl80211_cqm_rssi_threshold_event rssi_event),
+ enum nl80211_cqm_rssi_threshold_event rssi_event,
+ s32 rssi_level),
- TP_ARGS(sdata, rssi_event),
+ TP_ARGS(sdata, rssi_event, rssi_level),
TP_STRUCT__entry(
VIF_ENTRY
__field(u32, rssi_event)
+ __field(s32, rssi_level)
),
TP_fast_assign(
VIF_ASSIGN;
__entry->rssi_event = rssi_event;
+ __entry->rssi_level = rssi_level;
),
TP_printk(
- VIF_PR_FMT " event:%d",
- VIF_PR_ARG, __entry->rssi_event
+ VIF_PR_FMT " event:%d rssi:%d",
+ VIF_PR_ARG, __entry->rssi_event, __entry->rssi_level
)
);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index bd5f4be89435..ba8d7db0a071 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -16,6 +16,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/skbuff.h>
+#include <linux/if_vlan.h>
#include <linux/etherdevice.h>
#include <linux/bitmap.h>
#include <linux/rcupdate.h>
@@ -63,6 +64,10 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
struct ieee80211_chanctx_conf *chanctx_conf;
u32 rate_flags = 0;
+ /* assume HW handles this */
+ if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))
+ return 0;
+
rcu_read_lock();
chanctx_conf = rcu_dereference(tx->sdata->vif.chanctx_conf);
if (chanctx_conf) {
@@ -71,10 +76,6 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
}
rcu_read_unlock();
- /* assume HW handles this */
- if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))
- return 0;
-
/* uh huh? */
if (WARN_ON_ONCE(tx->rate.idx < 0))
return 0;
@@ -331,9 +332,8 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
I802_DEBUG_INC(tx->local->tx_handlers_drop_not_assoc);
return TX_DROP;
}
- } else if (unlikely(tx->sdata->vif.type == NL80211_IFTYPE_AP &&
- ieee80211_is_data(hdr->frame_control) &&
- !atomic_read(&tx->sdata->u.ap.num_mcast_sta))) {
+ } else if (unlikely(ieee80211_is_data(hdr->frame_control) &&
+ ieee80211_vif_get_num_mcast_if(tx->sdata) == 0)) {
/*
* No associated STAs - no need to send multicast
* frames.
@@ -935,7 +935,7 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
if (info->flags & IEEE80211_TX_CTL_DONTFRAG)
return TX_CONTINUE;
- if (tx->local->ops->set_frag_threshold)
+ if (ieee80211_hw_check(&tx->local->hw, SUPPORTS_TX_FRAG))
return TX_CONTINUE;
/*
@@ -1244,7 +1244,7 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
struct ieee80211_vif *vif,
- struct ieee80211_sta *pubsta,
+ struct sta_info *sta,
struct sk_buff *skb)
{
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
@@ -1258,10 +1258,13 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
if (!ieee80211_is_data(hdr->frame_control))
return NULL;
- if (pubsta) {
+ if (sta) {
u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK;
- txq = pubsta->txq[tid];
+ if (!sta->uploaded)
+ return NULL;
+
+ txq = sta->sta.txq[tid];
} else if (vif) {
txq = vif->txq;
}
@@ -1411,7 +1414,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
txqi->txq.sta = &sta->sta;
sta->sta.txq[tid] = &txqi->txq;
txqi->txq.tid = tid;
- txqi->txq.ac = ieee802_1d_to_ac[tid & 7];
+ txqi->txq.ac = ieee80211_ac_from_tid(tid);
} else {
sdata->vif.txq = &txqi->txq;
txqi->txq.tid = 0;
@@ -1504,23 +1507,17 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local,
struct fq *fq = &local->fq;
struct ieee80211_vif *vif;
struct txq_info *txqi;
- struct ieee80211_sta *pubsta;
if (!local->ops->wake_tx_queue ||
sdata->vif.type == NL80211_IFTYPE_MONITOR)
return false;
- if (sta && sta->uploaded)
- pubsta = &sta->sta;
- else
- pubsta = NULL;
-
if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
sdata = container_of(sdata->bss,
struct ieee80211_sub_if_data, u.ap);
vif = &sdata->vif;
- txqi = ieee80211_get_txq(local, vif, pubsta, skb);
+ txqi = ieee80211_get_txq(local, vif, sta, skb);
if (!txqi)
return false;
@@ -2798,7 +2795,7 @@ void ieee80211_check_fast_xmit(struct sta_info *sta)
/* fast-xmit doesn't handle fragmentation at all */
if (local->hw.wiphy->frag_threshold != (u32)-1 &&
- !local->ops->set_frag_threshold)
+ !ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG))
goto out;
rcu_read_lock();
@@ -3057,11 +3054,12 @@ static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
struct ieee80211_local *local = sdata->local;
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
struct ieee80211_hdr *hdr;
- struct ethhdr amsdu_hdr;
+ struct ethhdr *amsdu_hdr;
int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header);
int subframe_len = skb->len - hdr_len;
void *data;
- u8 *qc;
+ u8 *qc, *h_80211_src, *h_80211_dst;
+ const u8 *bssid;
if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE)
return false;
@@ -3069,19 +3067,44 @@ static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
if (info->control.flags & IEEE80211_TX_CTRL_AMSDU)
return true;
- if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(amsdu_hdr),
+ if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(*amsdu_hdr),
&subframe_len))
return false;
- amsdu_hdr.h_proto = cpu_to_be16(subframe_len);
- memcpy(amsdu_hdr.h_source, skb->data + fast_tx->sa_offs, ETH_ALEN);
- memcpy(amsdu_hdr.h_dest, skb->data + fast_tx->da_offs, ETH_ALEN);
+ data = skb_push(skb, sizeof(*amsdu_hdr));
+ memmove(data, data + sizeof(*amsdu_hdr), hdr_len);
+ hdr = data;
+ amsdu_hdr = data + hdr_len;
+ /* h_80211_src/dst is addr* field within hdr */
+ h_80211_src = data + fast_tx->sa_offs;
+ h_80211_dst = data + fast_tx->da_offs;
+
+ amsdu_hdr->h_proto = cpu_to_be16(subframe_len);
+ ether_addr_copy(amsdu_hdr->h_source, h_80211_src);
+ ether_addr_copy(amsdu_hdr->h_dest, h_80211_dst);
+
+ /* according to IEEE 802.11-2012 8.3.2 table 8-19, the outer SA/DA
+ * fields needs to be changed to BSSID for A-MSDU frames depending
+ * on FromDS/ToDS values.
+ */
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ bssid = sdata->u.mgd.bssid;
+ break;
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ bssid = sdata->vif.addr;
+ break;
+ default:
+ bssid = NULL;
+ }
+
+ if (bssid && ieee80211_has_fromds(hdr->frame_control))
+ ether_addr_copy(h_80211_src, bssid);
- data = skb_push(skb, sizeof(amsdu_hdr));
- memmove(data, data + sizeof(amsdu_hdr), hdr_len);
- memcpy(data + hdr_len, &amsdu_hdr, sizeof(amsdu_hdr));
+ if (bssid && ieee80211_has_tods(hdr->frame_control))
+ ether_addr_copy(h_80211_dst, bssid);
- hdr = data;
qc = ieee80211_get_qos_ctl(hdr);
*qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
@@ -3262,7 +3285,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2);
int hw_headroom = sdata->local->hw.extra_tx_headroom;
struct ethhdr eth;
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_tx_info *info;
struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
struct ieee80211_tx_data tx;
ieee80211_tx_result r;
@@ -3326,6 +3349,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
memcpy(skb->data + fast_tx->da_offs, eth.h_dest, ETH_ALEN);
memcpy(skb->data + fast_tx->sa_offs, eth.h_source, ETH_ALEN);
+ info = IEEE80211_SKB_CB(skb);
memset(info, 0, sizeof(*info));
info->band = fast_tx->band;
info->control.vif = &sdata->vif;
@@ -3548,6 +3572,115 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
rcu_read_unlock();
}
+static int ieee80211_change_da(struct sk_buff *skb, struct sta_info *sta)
+{
+ struct ethhdr *eth;
+ int err;
+
+ err = skb_ensure_writable(skb, ETH_HLEN);
+ if (unlikely(err))
+ return err;
+
+ eth = (void *)skb->data;
+ ether_addr_copy(eth->h_dest, sta->sta.addr);
+
+ return 0;
+}
+
+static bool ieee80211_multicast_to_unicast(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ const struct ethhdr *eth = (void *)skb->data;
+ const struct vlan_ethhdr *ethvlan = (void *)skb->data;
+ __be16 ethertype;
+
+ if (likely(!is_multicast_ether_addr(eth->h_dest)))
+ return false;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP_VLAN:
+ if (sdata->u.vlan.sta)
+ return false;
+ if (sdata->wdev.use_4addr)
+ return false;
+ /* fall through */
+ case NL80211_IFTYPE_AP:
+ /* check runtime toggle for this bss */
+ if (!sdata->bss->multicast_to_unicast)
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ /* multicast to unicast conversion only for some payload */
+ ethertype = eth->h_proto;
+ if (ethertype == htons(ETH_P_8021Q) && skb->len >= VLAN_ETH_HLEN)
+ ethertype = ethvlan->h_vlan_encapsulated_proto;
+ switch (ethertype) {
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_IP):
+ case htons(ETH_P_IPV6):
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static void
+ieee80211_convert_to_unicast(struct sk_buff *skb, struct net_device *dev,
+ struct sk_buff_head *queue)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ const struct ethhdr *eth = (struct ethhdr *)skb->data;
+ struct sta_info *sta, *first = NULL;
+ struct sk_buff *cloned_skb;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ if (sdata != sta->sdata)
+ /* AP-VLAN mismatch */
+ continue;
+ if (unlikely(ether_addr_equal(eth->h_source, sta->sta.addr)))
+ /* do not send back to source */
+ continue;
+ if (!first) {
+ first = sta;
+ continue;
+ }
+ cloned_skb = skb_clone(skb, GFP_ATOMIC);
+ if (!cloned_skb)
+ goto multicast;
+ if (unlikely(ieee80211_change_da(cloned_skb, sta))) {
+ dev_kfree_skb(cloned_skb);
+ goto multicast;
+ }
+ __skb_queue_tail(queue, cloned_skb);
+ }
+
+ if (likely(first)) {
+ if (unlikely(ieee80211_change_da(skb, first)))
+ goto multicast;
+ __skb_queue_tail(queue, skb);
+ } else {
+ /* no STA connected, drop */
+ kfree_skb(skb);
+ skb = NULL;
+ }
+
+ goto out;
+multicast:
+ __skb_queue_purge(queue);
+ __skb_queue_tail(queue, skb);
+out:
+ rcu_read_unlock();
+}
+
/**
* ieee80211_subif_start_xmit - netif start_xmit function for 802.3 vifs
* @skb: packet to be sent
@@ -3558,7 +3691,17 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
struct net_device *dev)
{
- __ieee80211_subif_start_xmit(skb, dev, 0);
+ if (unlikely(ieee80211_multicast_to_unicast(skb, dev))) {
+ struct sk_buff_head queue;
+
+ __skb_queue_head_init(&queue);
+ ieee80211_convert_to_unicast(skb, dev, &queue);
+ while ((skb = __skb_dequeue(&queue)))
+ __ieee80211_subif_start_xmit(skb, dev, 0);
+ } else {
+ __ieee80211_subif_start_xmit(skb, dev, 0);
+ }
+
return NETDEV_TX_OK;
}
@@ -4051,7 +4194,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
}
if (ifmsh->sync_ops)
- ifmsh->sync_ops->adjust_tbtt(sdata, beacon);
+ ifmsh->sync_ops->adjust_tsf(sdata, beacon);
skb = dev_alloc_skb(local->tx_headroom +
beacon->head_len +
@@ -4516,7 +4659,7 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, int tid,
enum nl80211_band band)
{
- int ac = ieee802_1d_to_ac[tid & 7];
+ int ac = ieee80211_ac_from_tid(tid);
skb_reset_mac_header(skb);
skb_set_queue_mapping(skb, ac);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 545c79a42a77..ac59fbd280df 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -3308,10 +3308,11 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
struct ieee80211_local *local = sdata->local;
struct ieee80211_sub_if_data *sdata_iter;
enum nl80211_iftype iftype = sdata->wdev.iftype;
- int num[NUM_NL80211_IFTYPES];
struct ieee80211_chanctx *ctx;
- int num_different_channels = 0;
int total = 1;
+ struct iface_combination_params params = {
+ .radar_detect = radar_detect,
+ };
lockdep_assert_held(&local->chanctx_mtx);
@@ -3322,12 +3323,19 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
!chandef->chan))
return -EINVAL;
- if (chandef)
- num_different_channels = 1;
-
if (WARN_ON(iftype >= NUM_NL80211_IFTYPES))
return -EINVAL;
+ if (sdata->vif.type == NL80211_IFTYPE_AP ||
+ sdata->vif.type == NL80211_IFTYPE_MESH_POINT) {
+ /*
+ * always passing this is harmless, since it'll be the
+ * same value that cfg80211 finds if it finds the same
+ * interface ... and that's always allowed
+ */
+ params.new_beacon_int = sdata->vif.bss_conf.beacon_int;
+ }
+
/* Always allow software iftypes */
if (local->hw.wiphy->software_iftypes & BIT(iftype)) {
if (radar_detect)
@@ -3335,24 +3343,26 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
return 0;
}
- memset(num, 0, sizeof(num));
+ if (chandef)
+ params.num_different_channels = 1;
if (iftype != NL80211_IFTYPE_UNSPECIFIED)
- num[iftype] = 1;
+ params.iftype_num[iftype] = 1;
list_for_each_entry(ctx, &local->chanctx_list, list) {
if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)
continue;
- radar_detect |= ieee80211_chanctx_radar_detect(local, ctx);
+ params.radar_detect |=
+ ieee80211_chanctx_radar_detect(local, ctx);
if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE) {
- num_different_channels++;
+ params.num_different_channels++;
continue;
}
if (chandef && chanmode == IEEE80211_CHANCTX_SHARED &&
cfg80211_chandef_compatible(chandef,
&ctx->conf.def))
continue;
- num_different_channels++;
+ params.num_different_channels++;
}
list_for_each_entry_rcu(sdata_iter, &local->interfaces, list) {
@@ -3365,16 +3375,14 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
local->hw.wiphy->software_iftypes & BIT(wdev_iter->iftype))
continue;
- num[wdev_iter->iftype]++;
+ params.iftype_num[wdev_iter->iftype]++;
total++;
}
- if (total == 1 && !radar_detect)
+ if (total == 1 && !params.radar_detect)
return 0;
- return cfg80211_check_combinations(local->hw.wiphy,
- num_different_channels,
- radar_detect, num);
+ return cfg80211_check_combinations(local->hw.wiphy, &params);
}
static void
@@ -3390,12 +3398,10 @@ ieee80211_iter_max_chans(const struct ieee80211_iface_combination *c,
int ieee80211_max_num_channels(struct ieee80211_local *local)
{
struct ieee80211_sub_if_data *sdata;
- int num[NUM_NL80211_IFTYPES] = {};
struct ieee80211_chanctx *ctx;
- int num_different_channels = 0;
- u8 radar_detect = 0;
u32 max_num_different_channels = 1;
int err;
+ struct iface_combination_params params = {0};
lockdep_assert_held(&local->chanctx_mtx);
@@ -3403,17 +3409,17 @@ int ieee80211_max_num_channels(struct ieee80211_local *local)
if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)
continue;
- num_different_channels++;
+ params.num_different_channels++;
- radar_detect |= ieee80211_chanctx_radar_detect(local, ctx);
+ params.radar_detect |=
+ ieee80211_chanctx_radar_detect(local, ctx);
}
list_for_each_entry_rcu(sdata, &local->interfaces, list)
- num[sdata->wdev.iftype]++;
+ params.iftype_num[sdata->wdev.iftype]++;
- err = cfg80211_iter_combinations(local->hw.wiphy,
- num_different_channels, radar_detect,
- num, ieee80211_iter_max_chans,
+ err = cfg80211_iter_combinations(local->hw.wiphy, &params,
+ ieee80211_iter_max_chans,
&max_num_different_channels);
if (err < 0)
return err;
@@ -3456,3 +3462,10 @@ void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
*byte_cnt = txqi->tin.backlog_bytes + frag_bytes;
}
EXPORT_SYMBOL(ieee80211_txq_get_depth);
+
+const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS] = {
+ IEEE80211_WMM_IE_STA_QOSINFO_AC_VO,
+ IEEE80211_WMM_IE_STA_QOSINFO_AC_VI,
+ IEEE80211_WMM_IE_STA_QOSINFO_AC_BE,
+ IEEE80211_WMM_IE_STA_QOSINFO_AC_BK
+};
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index 6832bf6ab69f..19ec2189d3ac 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -436,14 +436,10 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, u8 opmode,
enum nl80211_band band)
{
- struct ieee80211_local *local = sdata->local;
- struct ieee80211_supported_band *sband;
enum ieee80211_sta_rx_bandwidth new_bw;
u32 changed = 0;
u8 nss;
- sband = local->hw.wiphy->bands[band];
-
/* ignore - no support for BF yet */
if (opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)
return 0;
@@ -527,8 +523,10 @@ void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode, band);
- if (changed > 0)
+ if (changed > 0) {
+ ieee80211_recalc_min_chandef(sdata);
rate_control_rate_update(local, sband, sta, changed);
+ }
}
void ieee80211_get_vht_mask_from_cap(__le16 vht_cap,
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index efa3f48f1ec5..73e8f347802e 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -293,7 +293,8 @@ ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx)
return RX_DROP_UNUSABLE;
ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key);
/* remove ICV */
- if (pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN))
+ if (!(status->flag & RX_FLAG_ICV_STRIPPED) &&
+ pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN))
return RX_DROP_UNUSABLE;
}
diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index 9eb0aee9105b..3e3d3014e9ab 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -236,26 +236,35 @@ void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata,
{
struct ieee80211_hdr *hdr = (void *)skb->data;
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+ u8 flags;
u8 *p;
- u8 ack_policy, tid;
if (!ieee80211_is_data_qos(hdr->frame_control))
return;
p = ieee80211_get_qos_ctl(hdr);
- tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
- /* preserve EOSP bit */
- ack_policy = *p & IEEE80211_QOS_CTL_EOSP;
+ /* set up the first byte */
+
+ /*
+ * preserve everything but the TID and ACK policy
+ * (which we both write here)
+ */
+ flags = *p & ~(IEEE80211_QOS_CTL_TID_MASK |
+ IEEE80211_QOS_CTL_ACK_POLICY_MASK);
if (is_multicast_ether_addr(hdr->addr1) ||
sdata->noack_map & BIT(tid)) {
- ack_policy |= IEEE80211_QOS_CTL_ACK_POLICY_NOACK;
+ flags |= IEEE80211_QOS_CTL_ACK_POLICY_NOACK;
info->flags |= IEEE80211_TX_CTL_NO_ACK;
}
- /* qos header is 2 bytes */
- *p++ = ack_policy | tid;
+ *p = flags | tid;
+
+ /* set up the second byte */
+ p++;
+
if (ieee80211_vif_is_mesh(&sdata->vif)) {
/* preserve RSPI and Mesh PS Level bit */
*p &= ((IEEE80211_QOS_CTL_RSPI |
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 42ce9bd4426f..c1ef22df865f 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -57,7 +57,7 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
if (info->control.hw_key &&
(info->flags & IEEE80211_TX_CTL_DONTFRAG ||
- tx->local->ops->set_frag_threshold) &&
+ ieee80211_hw_check(&tx->local->hw, SUPPORTS_TX_FRAG)) &&
!(tx->key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)) {
/* hwaccel - with no need for SW-generated MMIC */
return TX_CONTINUE;
@@ -294,7 +294,8 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx)
return RX_DROP_UNUSABLE;
/* Trim ICV */
- skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN);
+ if (!(status->flag & RX_FLAG_ICV_STRIPPED))
+ skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN);
/* Remove IV */
memmove(skb->data + IEEE80211_TKIP_IV_LEN, skb->data, hdrlen);
diff --git a/net/mac802154/Makefile b/net/mac802154/Makefile
index 17a51e8389e2..5857bb1e1695 100644
--- a/net/mac802154/Makefile
+++ b/net/mac802154/Makefile
@@ -3,5 +3,3 @@ mac802154-objs := main.o rx.o tx.o mac_cmd.o mib.o \
iface.o llsec.o util.o cfg.o trace.o
CFLAGS_trace.o := -I$(src)
-
-ccflags-y += -D__CHECK_ENDIAN__
diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c
index 6a3e1c2181d3..1e1c9b20bab7 100644
--- a/net/mac802154/llsec.c
+++ b/net/mac802154/llsec.c
@@ -18,6 +18,8 @@
#include <linux/bug.h>
#include <linux/completion.h>
#include <linux/ieee802154.h>
+#include <linux/rculist.h>
+
#include <crypto/aead.h>
#include <crypto/skcipher.h>
diff --git a/net/mac802154/util.c b/net/mac802154/util.c
index f9fd0957ab67..7c03fb0ea34c 100644
--- a/net/mac802154/util.c
+++ b/net/mac802154/util.c
@@ -80,11 +80,11 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb,
if (skb->len > max_sifs_size)
hrtimer_start(&local->ifs_timer,
- ktime_set(0, hw->phy->lifs_period * NSEC_PER_USEC),
+ hw->phy->lifs_period * NSEC_PER_USEC,
HRTIMER_MODE_REL);
else
hrtimer_start(&local->ifs_timer,
- ktime_set(0, hw->phy->sifs_period * NSEC_PER_USEC),
+ hw->phy->sifs_period * NSEC_PER_USEC,
HRTIMER_MODE_REL);
} else {
ieee802154_wake_queue(hw);
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 15fe97644ffe..6414079aa729 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -7,7 +7,9 @@
#include <linux/if_arp.h>
#include <linux/ipv6.h>
#include <linux/mpls.h>
+#include <linux/netconf.h>
#include <linux/vmalloc.h>
+#include <linux/percpu.h>
#include <net/ip.h>
#include <net/dst.h>
#include <net/sock.h>
@@ -17,8 +19,8 @@
#include <net/netns/generic.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
-#include <net/addrconf.h>
#endif
+#include <net/addrconf.h>
#include <net/nexthop.h>
#include "internal.h"
@@ -48,11 +50,6 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index)
return rt;
}
-static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
-{
- return rcu_dereference_rtnl(dev->mpls_ptr);
-}
-
bool mpls_output_possible(const struct net_device *dev)
{
return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev);
@@ -98,18 +95,44 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
}
EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
-static u32 mpls_multipath_hash(struct mpls_route *rt,
- struct sk_buff *skb, bool bos)
+void mpls_stats_inc_outucastpkts(struct net_device *dev,
+ const struct sk_buff *skb)
+{
+ struct mpls_dev *mdev;
+
+ if (skb->protocol == htons(ETH_P_MPLS_UC)) {
+ mdev = mpls_dev_get(dev);
+ if (mdev)
+ MPLS_INC_STATS_LEN(mdev, skb->len,
+ tx_packets,
+ tx_bytes);
+ } else if (skb->protocol == htons(ETH_P_IP)) {
+ IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct inet6_dev *in6dev = __in6_dev_get(dev);
+
+ if (in6dev)
+ IP6_UPD_PO_STATS(dev_net(dev), in6dev,
+ IPSTATS_MIB_OUT, skb->len);
+#endif
+ }
+}
+EXPORT_SYMBOL_GPL(mpls_stats_inc_outucastpkts);
+
+static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb)
{
struct mpls_entry_decoded dec;
+ unsigned int mpls_hdr_len = 0;
struct mpls_shim_hdr *hdr;
bool eli_seen = false;
int label_index;
u32 hash = 0;
- for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos;
+ for (label_index = 0; label_index < MAX_MP_SELECT_LABELS;
label_index++) {
- if (!pskb_may_pull(skb, sizeof(*hdr) * label_index))
+ mpls_hdr_len += sizeof(*hdr);
+ if (!pskb_may_pull(skb, mpls_hdr_len))
break;
/* Read and decode the current label */
@@ -134,37 +157,38 @@ static u32 mpls_multipath_hash(struct mpls_route *rt,
eli_seen = true;
}
- bos = dec.bos;
- if (bos && pskb_may_pull(skb, sizeof(*hdr) * label_index +
- sizeof(struct iphdr))) {
+ if (!dec.bos)
+ continue;
+
+ /* found bottom label; does skb have room for a header? */
+ if (pskb_may_pull(skb, mpls_hdr_len + sizeof(struct iphdr))) {
const struct iphdr *v4hdr;
- v4hdr = (const struct iphdr *)(mpls_hdr(skb) +
- label_index);
+ v4hdr = (const struct iphdr *)(hdr + 1);
if (v4hdr->version == 4) {
hash = jhash_3words(ntohl(v4hdr->saddr),
ntohl(v4hdr->daddr),
v4hdr->protocol, hash);
} else if (v4hdr->version == 6 &&
- pskb_may_pull(skb, sizeof(*hdr) * label_index +
- sizeof(struct ipv6hdr))) {
+ pskb_may_pull(skb, mpls_hdr_len +
+ sizeof(struct ipv6hdr))) {
const struct ipv6hdr *v6hdr;
- v6hdr = (const struct ipv6hdr *)(mpls_hdr(skb) +
- label_index);
-
+ v6hdr = (const struct ipv6hdr *)(hdr + 1);
hash = __ipv6_addr_jhash(&v6hdr->saddr, hash);
hash = __ipv6_addr_jhash(&v6hdr->daddr, hash);
hash = jhash_1word(v6hdr->nexthdr, hash);
}
}
+
+ break;
}
return hash;
}
static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
- struct sk_buff *skb, bool bos)
+ struct sk_buff *skb)
{
int alive = ACCESS_ONCE(rt->rt_nhn_alive);
u32 hash = 0;
@@ -180,7 +204,7 @@ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
if (alive <= 0)
return NULL;
- hash = mpls_multipath_hash(rt, skb, bos);
+ hash = mpls_multipath_hash(rt, skb);
nh_index = hash % alive;
if (alive == rt->rt_nhn)
goto out;
@@ -253,6 +277,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
struct mpls_nh *nh;
struct mpls_entry_decoded dec;
struct net_device *out_dev;
+ struct mpls_dev *out_mdev;
struct mpls_dev *mdev;
unsigned int hh_len;
unsigned int new_header_size;
@@ -262,56 +287,66 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
/* Careful this entire function runs inside of an rcu critical section */
mdev = mpls_dev_get(dev);
- if (!mdev || !mdev->input_enabled)
+ if (!mdev)
goto drop;
- if (skb->pkt_type != PACKET_HOST)
+ MPLS_INC_STATS_LEN(mdev, skb->len, rx_packets,
+ rx_bytes);
+
+ if (!mdev->input_enabled) {
+ MPLS_INC_STATS(mdev, rx_dropped);
goto drop;
+ }
+
+ if (skb->pkt_type != PACKET_HOST)
+ goto err;
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
- goto drop;
+ goto err;
if (!pskb_may_pull(skb, sizeof(*hdr)))
- goto drop;
+ goto err;
/* Read and decode the label */
hdr = mpls_hdr(skb);
dec = mpls_entry_decode(hdr);
- /* Pop the label */
- skb_pull(skb, sizeof(*hdr));
- skb_reset_network_header(skb);
-
- skb_orphan(skb);
-
rt = mpls_route_input_rcu(net, dec.label);
- if (!rt)
+ if (!rt) {
+ MPLS_INC_STATS(mdev, rx_noroute);
goto drop;
+ }
- nh = mpls_select_multipath(rt, skb, dec.bos);
+ nh = mpls_select_multipath(rt, skb);
if (!nh)
- goto drop;
+ goto err;
- /* Find the output device */
- out_dev = rcu_dereference(nh->nh_dev);
- if (!mpls_output_possible(out_dev))
- goto drop;
+ /* Pop the label */
+ skb_pull(skb, sizeof(*hdr));
+ skb_reset_network_header(skb);
+
+ skb_orphan(skb);
if (skb_warn_if_lro(skb))
- goto drop;
+ goto err;
skb_forward_csum(skb);
/* Verify ttl is valid */
if (dec.ttl <= 1)
- goto drop;
+ goto err;
dec.ttl -= 1;
+ /* Find the output device */
+ out_dev = rcu_dereference(nh->nh_dev);
+ if (!mpls_output_possible(out_dev))
+ goto tx_err;
+
/* Verify the destination can hold the packet */
new_header_size = mpls_nh_header_size(nh);
mtu = mpls_dev_mtu(out_dev);
if (mpls_pkt_too_big(skb, mtu - new_header_size))
- goto drop;
+ goto tx_err;
hh_len = LL_RESERVED_SPACE(out_dev);
if (!out_dev->header_ops)
@@ -319,7 +354,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
/* Ensure there is enough space for the headers in the skb */
if (skb_cow(skb, hh_len + new_header_size))
- goto drop;
+ goto tx_err;
skb->dev = out_dev;
skb->protocol = htons(ETH_P_MPLS_UC);
@@ -327,7 +362,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
if (unlikely(!new_header_size && dec.bos)) {
/* Penultimate hop popping */
if (!mpls_egress(rt, skb, dec))
- goto drop;
+ goto err;
} else {
bool bos;
int i;
@@ -343,6 +378,8 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
}
}
+ mpls_stats_inc_outucastpkts(out_dev, skb);
+
/* If via wasn't specified then send out using device address */
if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC)
err = neigh_xmit(NEIGH_LINK_TABLE, out_dev,
@@ -355,6 +392,13 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
__func__, err);
return 0;
+tx_err:
+ out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL;
+ if (out_mdev)
+ MPLS_INC_STATS(out_mdev, tx_errors);
+ goto drop;
+err:
+ MPLS_INC_STATS(mdev, rx_errors);
drop:
kfree_skb(skb);
return NET_RX_DROP;
@@ -853,15 +897,279 @@ errout:
return err;
}
+static void mpls_get_stats(struct mpls_dev *mdev,
+ struct mpls_link_stats *stats)
+{
+ struct mpls_pcpu_stats *p;
+ int i;
+
+ memset(stats, 0, sizeof(*stats));
+
+ for_each_possible_cpu(i) {
+ struct mpls_link_stats local;
+ unsigned int start;
+
+ p = per_cpu_ptr(mdev->stats, i);
+ do {
+ start = u64_stats_fetch_begin(&p->syncp);
+ local = p->stats;
+ } while (u64_stats_fetch_retry(&p->syncp, start));
+
+ stats->rx_packets += local.rx_packets;
+ stats->rx_bytes += local.rx_bytes;
+ stats->tx_packets += local.tx_packets;
+ stats->tx_bytes += local.tx_bytes;
+ stats->rx_errors += local.rx_errors;
+ stats->tx_errors += local.tx_errors;
+ stats->rx_dropped += local.rx_dropped;
+ stats->tx_dropped += local.tx_dropped;
+ stats->rx_noroute += local.rx_noroute;
+ }
+}
+
+static int mpls_fill_stats_af(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct mpls_link_stats *stats;
+ struct mpls_dev *mdev;
+ struct nlattr *nla;
+
+ mdev = mpls_dev_get(dev);
+ if (!mdev)
+ return -ENODATA;
+
+ nla = nla_reserve_64bit(skb, MPLS_STATS_LINK,
+ sizeof(struct mpls_link_stats),
+ MPLS_STATS_UNSPEC);
+ if (!nla)
+ return -EMSGSIZE;
+
+ stats = nla_data(nla);
+ mpls_get_stats(mdev, stats);
+
+ return 0;
+}
+
+static size_t mpls_get_stats_af_size(const struct net_device *dev)
+{
+ struct mpls_dev *mdev;
+
+ mdev = mpls_dev_get(dev);
+ if (!mdev)
+ return 0;
+
+ return nla_total_size_64bit(sizeof(struct mpls_link_stats));
+}
+
+static int mpls_netconf_fill_devconf(struct sk_buff *skb, struct mpls_dev *mdev,
+ u32 portid, u32 seq, int event,
+ unsigned int flags, int type)
+{
+ struct nlmsghdr *nlh;
+ struct netconfmsg *ncm;
+ bool all = false;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
+ flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ if (type == NETCONFA_ALL)
+ all = true;
+
+ ncm = nlmsg_data(nlh);
+ ncm->ncm_family = AF_MPLS;
+
+ if (nla_put_s32(skb, NETCONFA_IFINDEX, mdev->dev->ifindex) < 0)
+ goto nla_put_failure;
+
+ if ((all || type == NETCONFA_INPUT) &&
+ nla_put_s32(skb, NETCONFA_INPUT,
+ mdev->input_enabled) < 0)
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int mpls_netconf_msgsize_devconf(int type)
+{
+ int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
+ + nla_total_size(4); /* NETCONFA_IFINDEX */
+ bool all = false;
+
+ if (type == NETCONFA_ALL)
+ all = true;
+
+ if (all || type == NETCONFA_INPUT)
+ size += nla_total_size(4);
+
+ return size;
+}
+
+static void mpls_netconf_notify_devconf(struct net *net, int type,
+ struct mpls_dev *mdev)
+{
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(mpls_netconf_msgsize_devconf(type), GFP_KERNEL);
+ if (!skb)
+ goto errout;
+
+ err = mpls_netconf_fill_devconf(skb, mdev, 0, 0, RTM_NEWNETCONF,
+ 0, type);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_MPLS_NETCONF, NULL, GFP_KERNEL);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_MPLS_NETCONF, err);
+}
+
+static const struct nla_policy devconf_mpls_policy[NETCONFA_MAX + 1] = {
+ [NETCONFA_IFINDEX] = { .len = sizeof(int) },
+};
+
+static int mpls_netconf_get_devconf(struct sk_buff *in_skb,
+ struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct nlattr *tb[NETCONFA_MAX + 1];
+ struct netconfmsg *ncm;
+ struct net_device *dev;
+ struct mpls_dev *mdev;
+ struct sk_buff *skb;
+ int ifindex;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
+ devconf_mpls_policy);
+ if (err < 0)
+ goto errout;
+
+ err = -EINVAL;
+ if (!tb[NETCONFA_IFINDEX])
+ goto errout;
+
+ ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
+ dev = __dev_get_by_index(net, ifindex);
+ if (!dev)
+ goto errout;
+
+ mdev = mpls_dev_get(dev);
+ if (!mdev)
+ goto errout;
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(mpls_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL);
+ if (!skb)
+ goto errout;
+
+ err = mpls_netconf_fill_devconf(skb, mdev,
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
+ NETCONFA_ALL);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+errout:
+ return err;
+}
+
+static int mpls_netconf_dump_devconf(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct hlist_head *head;
+ struct net_device *dev;
+ struct mpls_dev *mdev;
+ int idx, s_idx;
+ int h, s_h;
+
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ rcu_read_lock();
+ cb->seq = net->dev_base_seq;
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ mdev = mpls_dev_get(dev);
+ if (!mdev)
+ goto cont;
+ if (mpls_netconf_fill_devconf(skb, mdev,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF,
+ NLM_F_MULTI,
+ NETCONFA_ALL) < 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ rcu_read_unlock();
+ }
+done:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+
+ return skb->len;
+}
+
#define MPLS_PERDEV_SYSCTL_OFFSET(field) \
(&((struct mpls_dev *)0)->field)
+static int mpls_conf_proc(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int oval = *(int *)ctl->data;
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+ if (write) {
+ struct mpls_dev *mdev = ctl->extra1;
+ int i = (int *)ctl->data - (int *)mdev;
+ struct net *net = ctl->extra2;
+ int val = *(int *)ctl->data;
+
+ if (i == offsetof(struct mpls_dev, input_enabled) &&
+ val != oval) {
+ mpls_netconf_notify_devconf(net,
+ NETCONFA_INPUT,
+ mdev);
+ }
+ }
+
+ return ret;
+}
+
static const struct ctl_table mpls_dev_table[] = {
{
.procname = "input",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = mpls_conf_proc,
.data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled),
},
{ }
@@ -871,6 +1179,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev,
struct mpls_dev *mdev)
{
char path[sizeof("net/mpls/conf/") + IFNAMSIZ];
+ struct net *net = dev_net(dev);
struct ctl_table *table;
int i;
@@ -881,8 +1190,11 @@ static int mpls_dev_sysctl_register(struct net_device *dev,
/* Table data contains only offsets relative to the base of
* the mdev at this point, so make them absolute.
*/
- for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++)
+ for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) {
table[i].data = (char *)mdev + (uintptr_t)table[i].data;
+ table[i].extra1 = mdev;
+ table[i].extra2 = net;
+ }
snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name);
@@ -911,6 +1223,7 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev)
{
struct mpls_dev *mdev;
int err = -ENOMEM;
+ int i;
ASSERT_RTNL();
@@ -918,23 +1231,46 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev)
if (!mdev)
return ERR_PTR(err);
+ mdev->stats = alloc_percpu(struct mpls_pcpu_stats);
+ if (!mdev->stats)
+ goto free;
+
+ for_each_possible_cpu(i) {
+ struct mpls_pcpu_stats *mpls_stats;
+
+ mpls_stats = per_cpu_ptr(mdev->stats, i);
+ u64_stats_init(&mpls_stats->syncp);
+ }
+
err = mpls_dev_sysctl_register(dev, mdev);
if (err)
goto free;
+ mdev->dev = dev;
rcu_assign_pointer(dev->mpls_ptr, mdev);
return mdev;
free:
+ free_percpu(mdev->stats);
kfree(mdev);
return ERR_PTR(err);
}
+static void mpls_dev_destroy_rcu(struct rcu_head *head)
+{
+ struct mpls_dev *mdev = container_of(head, struct mpls_dev, rcu);
+
+ free_percpu(mdev->stats);
+ kfree(mdev);
+}
+
static void mpls_ifdown(struct net_device *dev, int event)
{
struct mpls_route __rcu **platform_label;
struct net *net = dev_net(dev);
+ unsigned int nh_flags = RTNH_F_DEAD | RTNH_F_LINKDOWN;
+ unsigned int alive;
unsigned index;
platform_label = rtnl_dereference(net->mpls.platform_label);
@@ -944,9 +1280,11 @@ static void mpls_ifdown(struct net_device *dev, int event)
if (!rt)
continue;
+ alive = 0;
change_nexthops(rt) {
if (rtnl_dereference(nh->nh_dev) != dev)
- continue;
+ goto next;
+
switch (event) {
case NETDEV_DOWN:
case NETDEV_UNREGISTER:
@@ -954,12 +1292,16 @@ static void mpls_ifdown(struct net_device *dev, int event)
/* fall through */
case NETDEV_CHANGE:
nh->nh_flags |= RTNH_F_LINKDOWN;
- ACCESS_ONCE(rt->rt_nhn_alive) = rt->rt_nhn_alive - 1;
break;
}
if (event == NETDEV_UNREGISTER)
RCU_INIT_POINTER(nh->nh_dev, NULL);
+next:
+ if (!(nh->nh_flags & nh_flags))
+ alive++;
} endfor_nexthops(rt);
+
+ WRITE_ONCE(rt->rt_nhn_alive, alive);
}
}
@@ -1045,7 +1387,7 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
if (mdev) {
mpls_dev_sysctl_unregister(mdev);
RCU_INIT_POINTER(dev->mpls_ptr, NULL);
- kfree_rcu(mdev, rcu);
+ call_rcu(&mdev->rcu, mpls_dev_destroy_rcu);
}
break;
case NETDEV_CHANGENAME:
@@ -1694,6 +2036,7 @@ static void mpls_net_exit(struct net *net)
for (index = 0; index < platform_labels; index++) {
struct mpls_route *rt = rtnl_dereference(platform_label[index]);
RCU_INIT_POINTER(platform_label[index], NULL);
+ mpls_notify_route(net, index, rt, NULL, NULL);
mpls_rt_free(rt);
}
rtnl_unlock();
@@ -1706,6 +2049,12 @@ static struct pernet_operations mpls_net_ops = {
.exit = mpls_net_exit,
};
+static struct rtnl_af_ops mpls_af_ops __read_mostly = {
+ .family = AF_MPLS,
+ .fill_stats_af = mpls_fill_stats_af,
+ .get_stats_af_size = mpls_get_stats_af_size,
+};
+
static int __init mpls_init(void)
{
int err;
@@ -1722,9 +2071,13 @@ static int __init mpls_init(void)
dev_add_pack(&mpls_packet_type);
+ rtnl_af_register(&mpls_af_ops);
+
rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, NULL);
rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL);
rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL);
+ rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf,
+ mpls_netconf_dump_devconf, NULL);
err = 0;
out:
return err;
@@ -1738,6 +2091,7 @@ module_init(mpls_init);
static void __exit mpls_exit(void)
{
rtnl_unregister_all(PF_MPLS);
+ rtnl_af_unregister(&mpls_af_ops);
dev_remove_pack(&mpls_packet_type);
unregister_netdevice_notifier(&mpls_dev_notifier);
unregister_pernet_subsys(&mpls_net_ops);
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index bdfef6c3271a..76360d8b9579 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -9,13 +9,58 @@ struct mpls_entry_decoded {
u8 bos;
};
+struct mpls_pcpu_stats {
+ struct mpls_link_stats stats;
+ struct u64_stats_sync syncp;
+};
+
struct mpls_dev {
- int input_enabled;
+ int input_enabled;
+ struct net_device *dev;
+ struct mpls_pcpu_stats __percpu *stats;
- struct ctl_table_header *sysctl;
- struct rcu_head rcu;
+ struct ctl_table_header *sysctl;
+ struct rcu_head rcu;
};
+#if BITS_PER_LONG == 32
+
+#define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \
+ do { \
+ __typeof__(*(mdev)->stats) *ptr = \
+ raw_cpu_ptr((mdev)->stats); \
+ local_bh_disable(); \
+ u64_stats_update_begin(&ptr->syncp); \
+ ptr->stats.pkts_field++; \
+ ptr->stats.bytes_field += (len); \
+ u64_stats_update_end(&ptr->syncp); \
+ local_bh_enable(); \
+ } while (0)
+
+#define MPLS_INC_STATS(mdev, field) \
+ do { \
+ __typeof__(*(mdev)->stats) *ptr = \
+ raw_cpu_ptr((mdev)->stats); \
+ local_bh_disable(); \
+ u64_stats_update_begin(&ptr->syncp); \
+ ptr->stats.field++; \
+ u64_stats_update_end(&ptr->syncp); \
+ local_bh_enable(); \
+ } while (0)
+
+#else
+
+#define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \
+ do { \
+ this_cpu_inc((mdev)->stats->stats.pkts_field); \
+ this_cpu_add((mdev)->stats->stats.bytes_field, (len)); \
+ } while (0)
+
+#define MPLS_INC_STATS(mdev, field) \
+ this_cpu_inc((mdev)->stats->stats.field)
+
+#endif
+
struct sk_buff;
#define LABEL_NOT_SPECIFIED (1 << 20)
@@ -114,6 +159,11 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *
return result;
}
+static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
+{
+ return rcu_dereference_rtnl(dev->mpls_ptr);
+}
+
int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels,
const u32 label[]);
int nla_get_labels(const struct nlattr *nla, u32 max_labels, u8 *labels,
@@ -123,5 +173,7 @@ int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table,
bool mpls_output_possible(const struct net_device *dev);
unsigned int mpls_dev_mtu(const struct net_device *dev);
bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu);
+void mpls_stats_inc_outucastpkts(struct net_device *dev,
+ const struct sk_buff *skb);
#endif /* MPLS_INTERNAL_H */
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index cf52cf30ac4b..e4e4424f9eb1 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -48,11 +48,15 @@ static int mpls_xmit(struct sk_buff *skb)
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = NULL;
struct rt6_info *rt6 = NULL;
+ struct mpls_dev *out_mdev;
int err = 0;
bool bos;
int i;
unsigned int ttl;
+ /* Find the output device */
+ out_dev = dst->dev;
+
/* Obtain the ttl */
if (dst->ops->family == AF_INET) {
ttl = ip_hdr(skb)->ttl;
@@ -66,8 +70,6 @@ static int mpls_xmit(struct sk_buff *skb)
skb_orphan(skb);
- /* Find the output device */
- out_dev = dst->dev;
if (!mpls_output_possible(out_dev) ||
!dst->lwtstate || skb_warn_if_lro(skb))
goto drop;
@@ -109,6 +111,8 @@ static int mpls_xmit(struct sk_buff *skb)
bos = false;
}
+ mpls_stats_inc_outucastpkts(out_dev, skb);
+
if (rt)
err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway,
skb);
@@ -122,18 +126,20 @@ static int mpls_xmit(struct sk_buff *skb)
return LWTUNNEL_XMIT_DONE;
drop:
+ out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL;
+ if (out_mdev)
+ MPLS_INC_STATS(out_mdev, tx_errors);
kfree_skb(skb);
return -EINVAL;
}
-static int mpls_build_state(struct net_device *dev, struct nlattr *nla,
+static int mpls_build_state(struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts)
{
struct mpls_iptunnel_encap *tun_encap_info;
struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1];
struct lwtunnel_state *newts;
- int tun_encap_info_len;
int ret;
ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla,
@@ -144,13 +150,11 @@ static int mpls_build_state(struct net_device *dev, struct nlattr *nla,
if (!tb[MPLS_IPTUNNEL_DST])
return -EINVAL;
- tun_encap_info_len = sizeof(*tun_encap_info);
- newts = lwtunnel_state_alloc(tun_encap_info_len);
+ newts = lwtunnel_state_alloc(sizeof(*tun_encap_info));
if (!newts)
return -ENOMEM;
- newts->len = tun_encap_info_len;
tun_encap_info = mpls_lwtunnel_encap(newts);
ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS,
&tun_encap_info->labels, tun_encap_info->label);
@@ -218,6 +222,7 @@ static const struct lwtunnel_encap_ops mpls_iptun_ops = {
.fill_encap = mpls_fill_encap_info,
.get_encap_size = mpls_encap_nlsize,
.cmp_encap = mpls_encap_cmp,
+ .owner = THIS_MODULE,
};
static int __init mpls_iptunnel_init(void)
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e8d56d9a4df2..9b28864cc36a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -57,6 +57,10 @@ config NF_CONNTRACK
config NF_LOG_COMMON
tristate
+config NF_LOG_NETDEV
+ tristate "Netdev packet logging"
+ select NF_LOG_COMMON
+
if NF_CONNTRACK
config NF_CONNTRACK_MARK
@@ -142,38 +146,39 @@ config NF_CONNTRACK_LABELS
to connection tracking entries. It selected by the connlabel match.
config NF_CT_PROTO_DCCP
- tristate 'DCCP protocol connection tracking support'
+ bool 'DCCP protocol connection tracking support'
depends on NETFILTER_ADVANCED
- default IP_DCCP
+ default y
help
With this option enabled, the layer 3 independent connection
tracking code will be able to do state tracking on DCCP connections.
- If unsure, say 'N'.
+ If unsure, say Y.
config NF_CT_PROTO_GRE
tristate
config NF_CT_PROTO_SCTP
- tristate 'SCTP protocol connection tracking support'
+ bool 'SCTP protocol connection tracking support'
depends on NETFILTER_ADVANCED
- default IP_SCTP
+ default y
+ select LIBCRC32C
help
With this option enabled, the layer 3 independent connection
tracking code will be able to do state tracking on SCTP connections.
- If you want to compile it as a module, say M here and read
- <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+ If unsure, say Y.
config NF_CT_PROTO_UDPLITE
- tristate 'UDP-Lite protocol connection tracking support'
+ bool 'UDP-Lite protocol connection tracking support'
depends on NETFILTER_ADVANCED
+ default y
help
With this option enabled, the layer 3 independent connection
tracking code will be able to do state tracking on UDP-Lite
connections.
- To compile it as a module, choose M here. If unsure, say N.
+ If unsure, say Y.
config NF_CONNTRACK_AMANDA
tristate "Amanda backup protocol support"
@@ -380,20 +385,19 @@ config NF_NAT_NEEDED
default y
config NF_NAT_PROTO_DCCP
- tristate
+ bool
depends on NF_NAT && NF_CT_PROTO_DCCP
default NF_NAT && NF_CT_PROTO_DCCP
config NF_NAT_PROTO_UDPLITE
- tristate
+ bool
depends on NF_NAT && NF_CT_PROTO_UDPLITE
default NF_NAT && NF_CT_PROTO_UDPLITE
config NF_NAT_PROTO_SCTP
- tristate
+ bool
default NF_NAT && NF_CT_PROTO_SCTP
depends on NF_NAT && NF_CT_PROTO_SCTP
- select LIBCRC32C
config NF_NAT_AMANDA
tristate
@@ -463,10 +467,10 @@ config NF_TABLES_NETDEV
This option enables support for the "netdev" table.
config NFT_EXTHDR
- tristate "Netfilter nf_tables IPv6 exthdr module"
+ tristate "Netfilter nf_tables exthdr module"
help
This option adds the "exthdr" expression that you can use to match
- IPv6 extension headers.
+ IPv6 extension headers and tcp options.
config NFT_META
tristate "Netfilter nf_tables meta module"
@@ -474,6 +478,12 @@ config NFT_META
This option adds the "meta" expression that you can use to match and
to set packet metainformation such as the packet mark.
+config NFT_RT
+ tristate "Netfilter nf_tables routing module"
+ help
+ This option adds the "rt" expression that you can use to match
+ packet routing information such as the packet nexthop.
+
config NFT_NUMGEN
tristate "Netfilter nf_tables number generator module"
help
@@ -484,7 +494,7 @@ config NFT_CT
depends on NF_CONNTRACK
tristate "Netfilter nf_tables conntrack module"
help
- This option adds the "meta" expression that you can use to match
+ This option adds the "ct" expression that you can use to match
connection tracking information such as the flow state.
config NFT_SET_RBTREE
@@ -499,6 +509,12 @@ config NFT_SET_HASH
This option adds the "hash" set type that is used to build one-way
mappings between matchings and actions.
+config NFT_SET_BITMAP
+ tristate "Netfilter nf_tables bitmap set module"
+ help
+ This option adds the "bitmap" set type that is used to build sets
+ whose keys are smaller or equal to 16 bits.
+
config NFT_COUNTER
tristate "Netfilter nf_tables counter module"
help
@@ -541,6 +557,12 @@ config NFT_NAT
This option adds the "nat" expression that you can use to perform
typical Network Address Translation (NAT) packet transformations.
+config NFT_OBJREF
+ tristate "Netfilter nf_tables stateful object reference module"
+ help
+ This option adds the "objref" expression that allows you to refer to
+ stateful objects, such as counters and quotas.
+
config NFT_QUEUE
depends on NETFILTER_NETLINK_QUEUE
tristate "Netfilter nf_tables queue module"
@@ -581,6 +603,19 @@ config NFT_HASH
This option adds the "hash" expression that you can use to perform
a hash operation on registers.
+config NFT_FIB
+ tristate
+
+config NFT_FIB_INET
+ depends on NF_TABLES_INET
+ depends on NFT_FIB_IPV4
+ depends on NFT_FIB_IPV6
+ tristate "Netfilter nf_tables fib inet support"
+ help
+ This option allows using the FIB expression from the inet table.
+ The lookup will be delegated to the IPv4 or IPv6 FIB depending
+ on the protocol of the packet.
+
if NF_TABLES_NETDEV
config NF_DUP_NETDEV
@@ -1409,9 +1444,10 @@ config NETFILTER_XT_MATCH_SOCKET
tristate '"socket" match support'
depends on NETFILTER_XTABLES
depends on NETFILTER_ADVANCED
- depends on !NF_CONNTRACK || NF_CONNTRACK
depends on IPV6 || IPV6=n
depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
+ depends on NF_SOCKET_IPV4
+ depends on NF_SOCKET_IPV6
select NF_DEFRAG_IPV4
select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index c23c3c84416f..c9b78e7b342f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -5,6 +5,8 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
+nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
+nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
obj-$(CONFIG_NETFILTER) = netfilter.o
@@ -16,11 +18,7 @@ obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
# connection tracking
obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
-# SCTP protocol connection tracking
-obj-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
obj-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
-obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
-obj-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o
# netlink interface for nf_conntrack
obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o
@@ -45,17 +43,19 @@ obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o
nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \
nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o
+# NAT protocols (nf_nat)
+nf_nat-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
+nf_nat-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
+
# generic transport layer logging
obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
+# packet logging for netdev family
+obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o
+
obj-$(CONFIG_NF_NAT) += nf_nat.o
obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
-# NAT protocols (nf_nat)
-obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
-obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
-obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
-
# NAT helpers
obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
@@ -81,21 +81,26 @@ obj-$(CONFIG_NF_TABLES_NETDEV) += nf_tables_netdev.o
obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o
obj-$(CONFIG_NFT_META) += nft_meta.o
+obj-$(CONFIG_NFT_RT) += nft_rt.o
obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o
obj-$(CONFIG_NFT_CT) += nft_ct.o
obj-$(CONFIG_NFT_LIMIT) += nft_limit.o
obj-$(CONFIG_NFT_NAT) += nft_nat.o
+obj-$(CONFIG_NFT_OBJREF) += nft_objref.o
obj-$(CONFIG_NFT_QUEUE) += nft_queue.o
obj-$(CONFIG_NFT_QUOTA) += nft_quota.o
obj-$(CONFIG_NFT_REJECT) += nft_reject.o
obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o
obj-$(CONFIG_NFT_SET_RBTREE) += nft_set_rbtree.o
obj-$(CONFIG_NFT_SET_HASH) += nft_set_hash.o
+obj-$(CONFIG_NFT_SET_BITMAP) += nft_set_bitmap.o
obj-$(CONFIG_NFT_COUNTER) += nft_counter.o
obj-$(CONFIG_NFT_LOG) += nft_log.o
obj-$(CONFIG_NFT_MASQ) += nft_masq.o
obj-$(CONFIG_NFT_REDIR) += nft_redir.o
obj-$(CONFIG_NFT_HASH) += nft_hash.o
+obj-$(CONFIG_NFT_FIB) += nft_fib.o
+obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o
# nf_tables netdev
obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 004af030ef1a..a87a6f8a74d8 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -102,17 +102,14 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
if (!entry)
return -ENOMEM;
- entry->orig_ops = reg;
- entry->ops = *reg;
- entry->next = NULL;
+ nf_hook_entry_init(entry, reg);
mutex_lock(&nf_hook_mutex);
/* Find the spot in the list */
- while ((p = nf_entry_dereference(*pp)) != NULL) {
- if (reg->priority < p->orig_ops->priority)
+ for (; (p = nf_entry_dereference(*pp)) != NULL; pp = &p->next) {
+ if (reg->priority < nf_hook_entry_priority(p))
break;
- pp = &p->next;
}
rcu_assign_pointer(entry->next, p);
rcu_assign_pointer(*pp, entry);
@@ -139,12 +136,11 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
return;
mutex_lock(&nf_hook_mutex);
- while ((p = nf_entry_dereference(*pp)) != NULL) {
- if (p->orig_ops == reg) {
+ for (; (p = nf_entry_dereference(*pp)) != NULL; pp = &p->next) {
+ if (nf_hook_entry_ops(p) == reg) {
rcu_assign_pointer(*pp, p->next);
break;
}
- pp = &p->next;
}
mutex_unlock(&nf_hook_mutex);
if (!p) {
@@ -302,70 +298,40 @@ void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
}
EXPORT_SYMBOL(_nf_unregister_hooks);
-unsigned int nf_iterate(struct sk_buff *skb,
- struct nf_hook_state *state,
- struct nf_hook_entry **entryp)
+/* Returns 1 if okfn() needs to be executed by the caller,
+ * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. */
+int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
+ struct nf_hook_entry *entry)
{
unsigned int verdict;
+ int ret;
- /*
- * The caller must not block between calls to this
- * function because of risk of continuing from deleted element.
- */
- while (*entryp) {
- if (state->thresh > (*entryp)->ops.priority) {
- *entryp = rcu_dereference((*entryp)->next);
- continue;
- }
-
- /* Optimization: we don't need to hold module
- reference here, since function can't sleep. --RR */
-repeat:
- verdict = (*entryp)->ops.hook((*entryp)->ops.priv, skb, state);
- if (verdict != NF_ACCEPT) {
-#ifdef CONFIG_NETFILTER_DEBUG
- if (unlikely((verdict & NF_VERDICT_MASK)
- > NF_MAX_VERDICT)) {
- NFDEBUG("Evil return from %p(%u).\n",
- (*entryp)->ops.hook, state->hook);
- *entryp = rcu_dereference((*entryp)->next);
+ do {
+ verdict = nf_hook_entry_hookfn(entry, skb, state);
+ switch (verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ entry = rcu_dereference(entry->next);
+ break;
+ case NF_DROP:
+ kfree_skb(skb);
+ ret = NF_DROP_GETERR(verdict);
+ if (ret == 0)
+ ret = -EPERM;
+ return ret;
+ case NF_QUEUE:
+ ret = nf_queue(skb, state, &entry, verdict);
+ if (ret == 1 && entry)
continue;
- }
-#endif
- if (verdict != NF_REPEAT)
- return verdict;
- goto repeat;
+ return ret;
+ default:
+ /* Implicit handling for NF_STOLEN, as well as any other
+ * non conventional verdicts.
+ */
+ return 0;
}
- *entryp = rcu_dereference((*entryp)->next);
- }
- return NF_ACCEPT;
-}
-
+ } while (entry);
-/* Returns 1 if okfn() needs to be executed by the caller,
- * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. */
-int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state)
-{
- struct nf_hook_entry *entry;
- unsigned int verdict;
- int ret = 0;
-
- entry = rcu_dereference(state->hook_entries);
-next_hook:
- verdict = nf_iterate(skb, state, &entry);
- if (verdict == NF_ACCEPT || verdict == NF_STOP) {
- ret = 1;
- } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
- kfree_skb(skb);
- ret = NF_DROP_GETERR(verdict);
- if (ret == 0)
- ret = -EPERM;
- } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
- ret = nf_queue(skb, state, &entry, verdict);
- if (ret == 1 && entry)
- goto next_hook;
- }
- return ret;
+ return 1;
}
EXPORT_SYMBOL(nf_hook_slow);
@@ -409,7 +375,7 @@ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
{
void (*attach)(struct sk_buff *, const struct sk_buff *);
- if (skb->nfct) {
+ if (skb->_nfct) {
rcu_read_lock();
attach = rcu_dereference(ip_ct_attach);
if (attach)
diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig
index 234a8ec82076..4083a8051f0f 100644
--- a/net/netfilter/ipset/Kconfig
+++ b/net/netfilter/ipset/Kconfig
@@ -99,6 +99,15 @@ config IP_SET_HASH_IPPORTNET
To compile it as a module, choose M here. If unsure, say N.
+config IP_SET_HASH_IPMAC
+ tristate "hash:ip,mac set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,mac set type support, by which
+ one can store IPv4/IPv6 address and MAC (ethernet address) pairs in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config IP_SET_HASH_MAC
tristate "hash:mac set support"
depends on IP_SET
diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile
index 3dbd5e958489..28ec148df02d 100644
--- a/net/netfilter/ipset/Makefile
+++ b/net/netfilter/ipset/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o
# hash types
obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o
+obj-$(CONFIG_IP_SET_HASH_IPMAC) += ip_set_hash_ipmac.o
obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o
obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o
obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 2e8e7e5fb4a6..6f09a99298cd 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -22,6 +22,7 @@
#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt)
#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt)
#define mtype_destroy IPSET_TOKEN(MTYPE, _destroy)
+#define mtype_memsize IPSET_TOKEN(MTYPE, _memsize)
#define mtype_flush IPSET_TOKEN(MTYPE, _flush)
#define mtype_head IPSET_TOKEN(MTYPE, _head)
#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set)
@@ -40,11 +41,8 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
{
struct mtype *map = set->data;
- init_timer(&map->gc);
- map->gc.data = (unsigned long)set;
- map->gc.function = gc;
- map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
- add_timer(&map->gc);
+ setup_timer(&map->gc, gc, (unsigned long)set);
+ mod_timer(&map->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
}
static void
@@ -82,6 +80,16 @@ mtype_flush(struct ip_set *set)
if (set->extensions & IPSET_EXT_DESTROY)
mtype_ext_cleanup(set);
memset(map->members, 0, map->memsize);
+ set->elements = 0;
+ set->ext_size = 0;
+}
+
+/* Calculate the actual memory size of the set data */
+static size_t
+mtype_memsize(const struct mtype *map, size_t dsize)
+{
+ return sizeof(*map) + map->memsize +
+ map->elements * dsize;
}
static int
@@ -89,14 +97,15 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
{
const struct mtype *map = set->data;
struct nlattr *nested;
- size_t memsize = sizeof(*map) + map->memsize;
+ size_t memsize = mtype_memsize(map, set->dsize) + set->ext_size;
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
goto nla_put_failure;
if (mtype_do_head(skb, map) ||
nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
- nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
+ nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
goto nla_put_failure;
@@ -140,6 +149,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
if (ret == IPSET_ADD_FAILED) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(x, set))) {
+ set->elements--;
ret = 0;
} else if (!(flags & IPSET_FLAG_EXIST)) {
set_bit(e->id, map->members);
@@ -148,6 +158,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
/* Element is re-added, cleanup extensions */
ip_set_ext_destroy(set, x);
}
+ if (ret > 0)
+ set->elements--;
if (SET_WITH_TIMEOUT(set))
#ifdef IP_SET_BITMAP_STORED_TIMEOUT
@@ -159,12 +171,13 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
if (SET_WITH_COUNTER(set))
ip_set_init_counter(ext_counter(x, set), ext);
if (SET_WITH_COMMENT(set))
- ip_set_init_comment(ext_comment(x, set), ext);
+ ip_set_init_comment(set, ext_comment(x, set), ext);
if (SET_WITH_SKBINFO(set))
ip_set_init_skbinfo(ext_skbinfo(x, set), ext);
/* Activate element */
set_bit(e->id, map->members);
+ set->elements++;
return 0;
}
@@ -181,6 +194,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
return -IPSET_ERR_EXIST;
ip_set_ext_destroy(set, x);
+ set->elements--;
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(x, set)))
return -IPSET_ERR_EXIST;
@@ -276,6 +290,7 @@ mtype_gc(unsigned long ul_set)
if (ip_set_timeout_expired(ext_timeout(x, set))) {
clear_bit(id, map->members);
ip_set_ext_destroy(set, x);
+ set->elements--;
}
}
spin_unlock_bh(&set->lock);
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index a748b0c2c981..c296f9b606d4 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -36,7 +36,7 @@ struct ip_set_net {
bool is_destroyed; /* all sets are destroyed */
};
-static int ip_set_net_id __read_mostly;
+static unsigned int ip_set_net_id __read_mostly;
static inline struct ip_set_net *ip_set_pernet(struct net *net)
{
@@ -324,7 +324,7 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
}
EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
-typedef void (*destroyer)(void *);
+typedef void (*destroyer)(struct ip_set *, void *);
/* ipset data extension types, in size order */
const struct ip_set_ext_type ip_set_extensions[] = {
@@ -426,20 +426,20 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
if (!SET_WITH_SKBINFO(set))
return -IPSET_ERR_SKBINFO;
fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK]));
- ext->skbmark = fullmark >> 32;
- ext->skbmarkmask = fullmark & 0xffffffff;
+ ext->skbinfo.skbmark = fullmark >> 32;
+ ext->skbinfo.skbmarkmask = fullmark & 0xffffffff;
}
if (tb[IPSET_ATTR_SKBPRIO]) {
if (!SET_WITH_SKBINFO(set))
return -IPSET_ERR_SKBINFO;
- ext->skbprio = be32_to_cpu(nla_get_be32(
- tb[IPSET_ATTR_SKBPRIO]));
+ ext->skbinfo.skbprio =
+ be32_to_cpu(nla_get_be32(tb[IPSET_ATTR_SKBPRIO]));
}
if (tb[IPSET_ATTR_SKBQUEUE]) {
if (!SET_WITH_SKBINFO(set))
return -IPSET_ERR_SKBINFO;
- ext->skbqueue = be16_to_cpu(nla_get_be16(
- tb[IPSET_ATTR_SKBQUEUE]));
+ ext->skbinfo.skbqueue =
+ be16_to_cpu(nla_get_be16(tb[IPSET_ATTR_SKBQUEUE]));
}
return 0;
}
@@ -541,7 +541,7 @@ int
ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
const struct xt_action_param *par, struct ip_set_adt_opt *opt)
{
- struct ip_set *set = ip_set_rcu_get(par->net, index);
+ struct ip_set *set = ip_set_rcu_get(xt_net(par), index);
int ret = 0;
BUG_ON(!set);
@@ -579,7 +579,7 @@ int
ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
const struct xt_action_param *par, struct ip_set_adt_opt *opt)
{
- struct ip_set *set = ip_set_rcu_get(par->net, index);
+ struct ip_set *set = ip_set_rcu_get(xt_net(par), index);
int ret;
BUG_ON(!set);
@@ -601,7 +601,7 @@ int
ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
const struct xt_action_param *par, struct ip_set_adt_opt *opt)
{
- struct ip_set *set = ip_set_rcu_get(par->net, index);
+ struct ip_set *set = ip_set_rcu_get(xt_net(par), index);
int ret = 0;
BUG_ON(!set);
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index d32fd6b036bf..f236c0bc7b3f 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -85,6 +85,8 @@ struct htable {
};
#define hbucket(h, i) ((h)->bucket[i])
+#define ext_size(n, dsize) \
+ (sizeof(struct hbucket) + (n) * (dsize))
#ifndef IPSET_NET_COUNT
#define IPSET_NET_COUNT 1
@@ -150,24 +152,34 @@ htable_bits(u32 hashsize)
#define INIT_CIDR(cidr, host_mask) \
DCIDR_PUT(((cidr) ? NCIDR_GET(cidr) : host_mask))
-#define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128)
-
#ifdef IP_SET_HASH_WITH_NET0
-/* cidr from 0 to SET_HOST_MASK() value and c = cidr + 1 */
-#define NLEN(family) (SET_HOST_MASK(family) + 1)
+/* cidr from 0 to HOST_MASK value and c = cidr + 1 */
+#define NLEN (HOST_MASK + 1)
#define CIDR_POS(c) ((c) - 1)
#else
-/* cidr from 1 to SET_HOST_MASK() value and c = cidr + 1 */
-#define NLEN(family) SET_HOST_MASK(family)
+/* cidr from 1 to HOST_MASK value and c = cidr + 1 */
+#define NLEN HOST_MASK
#define CIDR_POS(c) ((c) - 2)
#endif
#else
-#define NLEN(family) 0
+#define NLEN 0
#endif /* IP_SET_HASH_WITH_NETS */
#endif /* _IP_SET_HASH_GEN_H */
+#ifndef MTYPE
+#error "MTYPE is not defined!"
+#endif
+
+#ifndef HTYPE
+#error "HTYPE is not defined!"
+#endif
+
+#ifndef HOST_MASK
+#error "HOST_MASK is not defined!"
+#endif
+
/* Family dependent templates */
#undef ahash_data
@@ -191,7 +203,6 @@ htable_bits(u32 hashsize)
#undef mtype_same_set
#undef mtype_kadt
#undef mtype_uadt
-#undef mtype
#undef mtype_add
#undef mtype_del
@@ -207,6 +218,7 @@ htable_bits(u32 hashsize)
#undef mtype_variant
#undef mtype_data_match
+#undef htype
#undef HKEY
#define mtype_data_equal IPSET_TOKEN(MTYPE, _data_equal)
@@ -233,7 +245,6 @@ htable_bits(u32 hashsize)
#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set)
#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt)
#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt)
-#define mtype MTYPE
#define mtype_add IPSET_TOKEN(MTYPE, _add)
#define mtype_del IPSET_TOKEN(MTYPE, _del)
@@ -249,62 +260,54 @@ htable_bits(u32 hashsize)
#define mtype_variant IPSET_TOKEN(MTYPE, _variant)
#define mtype_data_match IPSET_TOKEN(MTYPE, _data_match)
-#ifndef MTYPE
-#error "MTYPE is not defined!"
-#endif
-
-#ifndef HOST_MASK
-#error "HOST_MASK is not defined!"
-#endif
-
#ifndef HKEY_DATALEN
#define HKEY_DATALEN sizeof(struct mtype_elem)
#endif
-#define HKEY(data, initval, htable_bits) \
-(jhash2((u32 *)(data), HKEY_DATALEN / sizeof(u32), initval) \
- & jhash_mask(htable_bits))
+#define htype MTYPE
-#ifndef htype
-#ifndef HTYPE
-#error "HTYPE is not defined!"
-#endif /* HTYPE */
-#define htype HTYPE
+#define HKEY(data, initval, htable_bits) \
+({ \
+ const u32 *__k = (const u32 *)data; \
+ u32 __l = HKEY_DATALEN / sizeof(u32); \
+ \
+ BUILD_BUG_ON(HKEY_DATALEN % sizeof(u32) != 0); \
+ \
+ jhash2(__k, __l, initval) & jhash_mask(htable_bits); \
+})
/* The generic hash structure */
struct htype {
struct htable __rcu *table; /* the hash table */
+ struct timer_list gc; /* garbage collection when timeout enabled */
u32 maxelem; /* max elements in the hash */
- u32 elements; /* current element (vs timeout) */
u32 initval; /* random jhash init value */
#ifdef IP_SET_HASH_WITH_MARKMASK
u32 markmask; /* markmask value for mark mask to store */
#endif
- struct timer_list gc; /* garbage collection when timeout enabled */
- struct mtype_elem next; /* temporary storage for uadd */
#ifdef IP_SET_HASH_WITH_MULTI
u8 ahash_max; /* max elements in an array block */
#endif
#ifdef IP_SET_HASH_WITH_NETMASK
u8 netmask; /* netmask value for subnets to store */
#endif
+ struct mtype_elem next; /* temporary storage for uadd */
#ifdef IP_SET_HASH_WITH_NETS
- struct net_prefixes nets[0]; /* book-keeping of prefixes */
+ struct net_prefixes nets[NLEN]; /* book-keeping of prefixes */
#endif
};
-#endif /* htype */
#ifdef IP_SET_HASH_WITH_NETS
/* Network cidr size book keeping when the hash stores different
* sized networks. cidr == real cidr + 1 to support /0.
*/
static void
-mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
+mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
{
int i, j;
/* Add in increasing prefix order, so larger cidr first */
- for (i = 0, j = -1; i < nets_length && h->nets[i].cidr[n]; i++) {
+ for (i = 0, j = -1; i < NLEN && h->nets[i].cidr[n]; i++) {
if (j != -1) {
continue;
} else if (h->nets[i].cidr[n] < cidr) {
@@ -323,11 +326,11 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
}
static void
-mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
+mtype_del_cidr(struct htype *h, u8 cidr, u8 n)
{
- u8 i, j, net_end = nets_length - 1;
+ u8 i, j, net_end = NLEN - 1;
- for (i = 0; i < nets_length; i++) {
+ for (i = 0; i < NLEN; i++) {
if (h->nets[i].cidr[n] != cidr)
continue;
h->nets[CIDR_POS(cidr)].nets[n]--;
@@ -343,24 +346,9 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
/* Calculate the actual memory size of the set data */
static size_t
-mtype_ahash_memsize(const struct htype *h, const struct htable *t,
- u8 nets_length, size_t dsize)
+mtype_ahash_memsize(const struct htype *h, const struct htable *t)
{
- u32 i;
- struct hbucket *n;
- size_t memsize = sizeof(*h) + sizeof(*t);
-
-#ifdef IP_SET_HASH_WITH_NETS
- memsize += sizeof(struct net_prefixes) * nets_length;
-#endif
- for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = rcu_dereference_bh(hbucket(t, i));
- if (!n)
- continue;
- memsize += sizeof(struct hbucket) + n->size * dsize;
- }
-
- return memsize;
+ return sizeof(*h) + sizeof(*t);
}
/* Get the ith element from the array block n */
@@ -398,9 +386,10 @@ mtype_flush(struct ip_set *set)
kfree_rcu(n, rcu);
}
#ifdef IP_SET_HASH_WITH_NETS
- memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family));
+ memset(h->nets, 0, sizeof(h->nets));
#endif
- h->elements = 0;
+ set->elements = 0;
+ set->ext_size = 0;
}
/* Destroy the hashtable part of the set */
@@ -444,11 +433,8 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
{
struct htype *h = set->data;
- init_timer(&h->gc);
- h->gc.data = (unsigned long)set;
- h->gc.function = gc;
- h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
- add_timer(&h->gc);
+ setup_timer(&h->gc, gc, (unsigned long)set);
+ mod_timer(&h->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
pr_debug("gc initialized, run in every %u\n",
IPSET_GC_PERIOD(set->timeout));
}
@@ -473,12 +459,13 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b)
/* Delete expired elements from the hashtable */
static void
-mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
+mtype_expire(struct ip_set *set, struct htype *h)
{
struct htable *t;
struct hbucket *n, *tmp;
struct mtype_elem *data;
u32 i, j, d;
+ size_t dsize = set->dsize;
#ifdef IP_SET_HASH_WITH_NETS
u8 k;
#endif
@@ -494,21 +481,20 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
continue;
}
data = ahash_data(n, j, dsize);
- if (ip_set_timeout_expired(ext_timeout(data, set))) {
- pr_debug("expired %u/%u\n", i, j);
- clear_bit(j, n->used);
- smp_mb__after_atomic();
+ if (!ip_set_timeout_expired(ext_timeout(data, set)))
+ continue;
+ pr_debug("expired %u/%u\n", i, j);
+ clear_bit(j, n->used);
+ smp_mb__after_atomic();
#ifdef IP_SET_HASH_WITH_NETS
- for (k = 0; k < IPSET_NET_COUNT; k++)
- mtype_del_cidr(h,
- NCIDR_PUT(DCIDR_GET(data->cidr,
- k)),
- nets_length, k);
+ for (k = 0; k < IPSET_NET_COUNT; k++)
+ mtype_del_cidr(h,
+ NCIDR_PUT(DCIDR_GET(data->cidr, k)),
+ k);
#endif
- ip_set_ext_destroy(set, data);
- h->elements--;
- d++;
- }
+ ip_set_ext_destroy(set, data);
+ set->elements--;
+ d++;
}
if (d >= AHASH_INIT_SIZE) {
if (d >= n->size) {
@@ -532,6 +518,7 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
d++;
}
tmp->pos = d;
+ set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
rcu_assign_pointer(hbucket(t, i), tmp);
kfree_rcu(n, rcu);
}
@@ -546,7 +533,7 @@ mtype_gc(unsigned long ul_set)
pr_debug("called\n");
spin_lock_bh(&set->lock);
- mtype_expire(set, h, NLEN(set->family), set->dsize);
+ mtype_expire(set, h);
spin_unlock_bh(&set->lock);
h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
@@ -563,7 +550,7 @@ mtype_resize(struct ip_set *set, bool retried)
struct htype *h = set->data;
struct htable *t, *orig;
u8 htable_bits;
- size_t dsize = set->dsize;
+ size_t extsize, dsize = set->dsize;
#ifdef IP_SET_HASH_WITH_NETS
u8 flags;
struct mtype_elem *tmp;
@@ -606,6 +593,7 @@ retry:
/* There can't be another parallel resizing, but dumping is possible */
atomic_set(&orig->ref, 1);
atomic_inc(&orig->uref);
+ extsize = 0;
pr_debug("attempt to resize set %s from %u to %u, t %p\n",
set->name, orig->htable_bits, htable_bits, orig);
for (i = 0; i < jhash_size(orig->htable_bits); i++) {
@@ -636,6 +624,7 @@ retry:
goto cleanup;
}
m->size = AHASH_INIT_SIZE;
+ extsize = ext_size(AHASH_INIT_SIZE, dsize);
RCU_INIT_POINTER(hbucket(t, key), m);
} else if (m->pos >= m->size) {
struct hbucket *ht;
@@ -655,6 +644,7 @@ retry:
memcpy(ht, m, sizeof(struct hbucket) +
m->size * dsize);
ht->size = m->size + AHASH_INIT_SIZE;
+ extsize += ext_size(AHASH_INIT_SIZE, dsize);
kfree(m);
m = ht;
RCU_INIT_POINTER(hbucket(t, key), ht);
@@ -668,6 +658,7 @@ retry:
}
}
rcu_assign_pointer(h->table, t);
+ set->ext_size = extsize;
spin_unlock_bh(&set->lock);
@@ -715,11 +706,11 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
bool deleted = false, forceadd = false, reuse = false;
u32 key, multi = 0;
- if (h->elements >= h->maxelem) {
+ if (set->elements >= h->maxelem) {
if (SET_WITH_TIMEOUT(set))
/* FIXME: when set is full, we slow down here */
- mtype_expire(set, h, NLEN(set->family), set->dsize);
- if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set))
+ mtype_expire(set, h);
+ if (set->elements >= h->maxelem && SET_WITH_FORCEADD(set))
forceadd = true;
}
@@ -727,20 +718,15 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
key = HKEY(value, h->initval, t->htable_bits);
n = __ipset_dereference_protected(hbucket(t, key), 1);
if (!n) {
- if (forceadd) {
- if (net_ratelimit())
- pr_warn("Set %s is full, maxelem %u reached\n",
- set->name, h->maxelem);
- return -IPSET_ERR_HASH_FULL;
- } else if (h->elements >= h->maxelem) {
+ if (forceadd || set->elements >= h->maxelem)
goto set_full;
- }
old = NULL;
n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize,
GFP_ATOMIC);
if (!n)
return -ENOMEM;
n->size = AHASH_INIT_SIZE;
+ set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
goto copy_elem;
}
for (i = 0; i < n->pos; i++) {
@@ -778,14 +764,14 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
for (i = 0; i < IPSET_NET_COUNT; i++)
mtype_del_cidr(h,
NCIDR_PUT(DCIDR_GET(data->cidr, i)),
- NLEN(set->family), i);
+ i);
#endif
ip_set_ext_destroy(set, data);
- h->elements--;
+ set->elements--;
}
goto copy_data;
}
- if (h->elements >= h->maxelem)
+ if (set->elements >= h->maxelem)
goto set_full;
/* Create a new slot */
if (n->pos >= n->size) {
@@ -804,17 +790,17 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
memcpy(n, old, sizeof(struct hbucket) +
old->size * set->dsize);
n->size = old->size + AHASH_INIT_SIZE;
+ set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
}
copy_elem:
j = n->pos++;
data = ahash_data(n, j, set->dsize);
copy_data:
- h->elements++;
+ set->elements++;
#ifdef IP_SET_HASH_WITH_NETS
for (i = 0; i < IPSET_NET_COUNT; i++)
- mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)),
- NLEN(set->family), i);
+ mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i);
#endif
memcpy(data, d, sizeof(struct mtype_elem));
overwrite_extensions:
@@ -824,7 +810,7 @@ overwrite_extensions:
if (SET_WITH_COUNTER(set))
ip_set_init_counter(ext_counter(data, set), ext);
if (SET_WITH_COMMENT(set))
- ip_set_init_comment(ext_comment(data, set), ext);
+ ip_set_init_comment(set, ext_comment(data, set), ext);
if (SET_WITH_SKBINFO(set))
ip_set_init_skbinfo(ext_skbinfo(data, set), ext);
/* Must come last for the case when timed out entry is reused */
@@ -883,11 +869,11 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
smp_mb__after_atomic();
if (i + 1 == n->pos)
n->pos--;
- h->elements--;
+ set->elements--;
#ifdef IP_SET_HASH_WITH_NETS
for (j = 0; j < IPSET_NET_COUNT; j++)
mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)),
- NLEN(set->family), j);
+ j);
#endif
ip_set_ext_destroy(set, data);
@@ -896,6 +882,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
k++;
}
if (n->pos == 0 && k == 0) {
+ set->ext_size -= ext_size(n->size, dsize);
rcu_assign_pointer(hbucket(t, key), NULL);
kfree_rcu(n, rcu);
} else if (k >= AHASH_INIT_SIZE) {
@@ -910,10 +897,11 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
continue;
data = ahash_data(n, j, dsize);
memcpy(tmp->value + k * dsize, data, dsize);
- set_bit(j, tmp->used);
+ set_bit(k, tmp->used);
k++;
}
tmp->pos = k;
+ set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
rcu_assign_pointer(hbucket(t, key), tmp);
kfree_rcu(n, rcu);
}
@@ -957,14 +945,13 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
int i, j = 0;
#endif
u32 key, multi = 0;
- u8 nets_length = NLEN(set->family);
pr_debug("test by nets\n");
- for (; j < nets_length && h->nets[j].cidr[0] && !multi; j++) {
+ for (; j < NLEN && h->nets[j].cidr[0] && !multi; j++) {
#if IPSET_NET_COUNT == 2
mtype_data_reset_elem(d, &orig);
mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]), false);
- for (k = 0; k < nets_length && h->nets[k].cidr[1] && !multi;
+ for (k = 0; k < NLEN && h->nets[k].cidr[1] && !multi;
k++) {
mtype_data_netmask(d, NCIDR_GET(h->nets[k].cidr[1]),
true);
@@ -1021,7 +1008,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
* try all possible network sizes
*/
for (i = 0; i < IPSET_NET_COUNT; i++)
- if (DCIDR_GET(d->cidr, i) != SET_HOST_MASK(set->family))
+ if (DCIDR_GET(d->cidr, i) != HOST_MASK)
break;
if (i == IPSET_NET_COUNT) {
ret = mtype_test_cidrs(set, d, ext, mext, flags);
@@ -1062,7 +1049,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
rcu_read_lock_bh();
t = rcu_dereference_bh_nfnl(h->table);
- memsize = mtype_ahash_memsize(h, t, NLEN(set->family), set->dsize);
+ memsize = mtype_ahash_memsize(h, t) + set->ext_size;
htable_bits = t->htable_bits;
rcu_read_unlock_bh();
@@ -1083,7 +1070,8 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
goto nla_put_failure;
#endif
if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
- nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
+ nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
goto nla_put_failure;
@@ -1238,41 +1226,35 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
struct htype *h;
struct htable *t;
+ pr_debug("Create set %s with family %s\n",
+ set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
+
#ifndef IP_SET_PROTO_UNDEF
if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
return -IPSET_ERR_INVALID_FAMILY;
#endif
-#ifdef IP_SET_HASH_WITH_MARKMASK
- markmask = 0xffffffff;
-#endif
-#ifdef IP_SET_HASH_WITH_NETMASK
- netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
- pr_debug("Create set %s with family %s\n",
- set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
-#endif
-
if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
+
#ifdef IP_SET_HASH_WITH_MARKMASK
/* Separated condition in order to avoid directive in argument list */
if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK)))
return -IPSET_ERR_PROTOCOL;
-#endif
- if (tb[IPSET_ATTR_HASHSIZE]) {
- hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
- if (hashsize < IPSET_MIMINAL_HASHSIZE)
- hashsize = IPSET_MIMINAL_HASHSIZE;
+ markmask = 0xffffffff;
+ if (tb[IPSET_ATTR_MARKMASK]) {
+ markmask = ntohl(nla_get_be32(tb[IPSET_ATTR_MARKMASK]));
+ if (markmask == 0)
+ return -IPSET_ERR_INVALID_MARKMASK;
}
-
- if (tb[IPSET_ATTR_MAXELEM])
- maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+#endif
#ifdef IP_SET_HASH_WITH_NETMASK
+ netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
if (tb[IPSET_ATTR_NETMASK]) {
netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
@@ -1282,33 +1264,21 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
return -IPSET_ERR_INVALID_NETMASK;
}
#endif
-#ifdef IP_SET_HASH_WITH_MARKMASK
- if (tb[IPSET_ATTR_MARKMASK]) {
- markmask = ntohl(nla_get_be32(tb[IPSET_ATTR_MARKMASK]));
- if (markmask == 0)
- return -IPSET_ERR_INVALID_MARKMASK;
+ if (tb[IPSET_ATTR_HASHSIZE]) {
+ hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+ if (hashsize < IPSET_MIMINAL_HASHSIZE)
+ hashsize = IPSET_MIMINAL_HASHSIZE;
}
-#endif
+
+ if (tb[IPSET_ATTR_MAXELEM])
+ maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
hsize = sizeof(*h);
-#ifdef IP_SET_HASH_WITH_NETS
- hsize += sizeof(struct net_prefixes) * NLEN(set->family);
-#endif
h = kzalloc(hsize, GFP_KERNEL);
if (!h)
return -ENOMEM;
- h->maxelem = maxelem;
-#ifdef IP_SET_HASH_WITH_NETMASK
- h->netmask = netmask;
-#endif
-#ifdef IP_SET_HASH_WITH_MARKMASK
- h->markmask = markmask;
-#endif
- get_random_bytes(&h->initval, sizeof(h->initval));
- set->timeout = IPSET_NO_TIMEOUT;
-
hbits = htable_bits(hashsize);
hsize = htable_size(hbits);
if (hsize == 0) {
@@ -1320,8 +1290,17 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
kfree(h);
return -ENOMEM;
}
+ h->maxelem = maxelem;
+#ifdef IP_SET_HASH_WITH_NETMASK
+ h->netmask = netmask;
+#endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ h->markmask = markmask;
+#endif
+ get_random_bytes(&h->initval, sizeof(h->initval));
+
t->htable_bits = hbits;
- rcu_assign_pointer(h->table, t);
+ RCU_INIT_POINTER(h->table, t);
set->data = h;
#ifndef IP_SET_PROTO_UNDEF
@@ -1339,6 +1318,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
__alignof__(struct IPSET_TOKEN(HTYPE, 6_elem)));
}
#endif
+ set->timeout = IPSET_NO_TIMEOUT;
if (tb[IPSET_ATTR_TIMEOUT]) {
set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
#ifndef IP_SET_PROTO_UNDEF
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index 9d6bf19f7b78..20bfbd315f61 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -82,7 +82,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_ip *h = set->data;
+ const struct hash_ip4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ip4_elem e = { 0 };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -101,7 +101,7 @@ static int
hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ip *h = set->data;
+ const struct hash_ip4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ip4_elem e = { 0 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -199,7 +199,7 @@ nla_put_failure:
}
static inline void
-hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e)
+hash_ip6_data_next(struct hash_ip6_elem *next, const struct hash_ip6_elem *e)
{
}
@@ -217,7 +217,7 @@ hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_ip *h = set->data;
+ const struct hash_ip6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ip6_elem e = { { .all = { 0 } } };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -234,7 +234,7 @@ static int
hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ip *h = set->data;
+ const struct hash_ip6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ip6_elem e = { { .all = { 0 } } };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c
new file mode 100644
index 000000000000..1ab5ed2f6839
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipmac.c
@@ -0,0 +1,315 @@
+/* Copyright (C) 2016 Tomasz Chilinski <tomasz.chilinski@chilan.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,mac type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <linux/if_ether.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+#define IPSET_TYPE_REV_MAX 0
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Tomasz Chilinski <tomasz.chilinski@chilan.com>");
+IP_SET_MODULE_DESC("hash:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip,mac");
+
+/* Type specific function prefix */
+#define HTYPE hash_ipmac
+
+/* Zero valued element is not supported */
+static const unsigned char invalid_ether[ETH_ALEN] = { 0 };
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_ipmac4_elem {
+ /* Zero valued IP addresses cannot be stored */
+ __be32 ip;
+ union {
+ unsigned char ether[ETH_ALEN];
+ __be32 foo[2];
+ };
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipmac4_data_equal(const struct hash_ipmac4_elem *e1,
+ const struct hash_ipmac4_elem *e2,
+ u32 *multi)
+{
+ return e1->ip == e2->ip && ether_addr_equal(e1->ether, e2->ether);
+}
+
+static bool
+hash_ipmac4_data_list(struct sk_buff *skb, const struct hash_ipmac4_elem *e)
+{
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip) ||
+ nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether))
+ goto nla_put_failure;
+ return false;
+
+nla_put_failure:
+ return true;
+}
+
+static inline void
+hash_ipmac4_data_next(struct hash_ipmac4_elem *next,
+ const struct hash_ipmac4_elem *e)
+{
+ next->ip = e->ip;
+}
+
+#define MTYPE hash_ipmac4
+#define PF 4
+#define HOST_MASK 32
+#define HKEY_DATALEN sizeof(struct hash_ipmac4_elem)
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipmac4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipmac4_elem e = { .ip = 0, { .foo[0] = 0, .foo[1] = 0 } };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ /* MAC can be src only */
+ if (!(opt->flags & IPSET_DIM_TWO_SRC))
+ return 0;
+
+ if (skb_mac_header(skb) < skb->head ||
+ (skb_mac_header(skb) + ETH_HLEN) > skb->data)
+ return -EINVAL;
+
+ memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN);
+ if (ether_addr_equal(e.ether, invalid_ether))
+ return -EINVAL;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipmac4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipmac4_elem e = { .ip = 0, { .foo[0] = 0, .foo[1] = 0 } };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !tb[IPSET_ATTR_ETHER] ||
+ nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+ memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
+ if (ether_addr_equal(e.ether, invalid_ether))
+ return -IPSET_ERR_HASH_ELEM;
+
+ return adtfn(set, &e, &ext, &ext, flags);
+}
+
+/* IPv6 variant */
+
+/* Member elements */
+struct hash_ipmac6_elem {
+ /* Zero valued IP addresses cannot be stored */
+ union nf_inet_addr ip;
+ union {
+ unsigned char ether[ETH_ALEN];
+ __be32 foo[2];
+ };
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipmac6_data_equal(const struct hash_ipmac6_elem *e1,
+ const struct hash_ipmac6_elem *e2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&e1->ip.in6, &e2->ip.in6) &&
+ ether_addr_equal(e1->ether, e2->ether);
+}
+
+static bool
+hash_ipmac6_data_list(struct sk_buff *skb, const struct hash_ipmac6_elem *e)
+{
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) ||
+ nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether))
+ goto nla_put_failure;
+ return false;
+
+nla_put_failure:
+ return true;
+}
+
+static inline void
+hash_ipmac6_data_next(struct hash_ipmac6_elem *next,
+ const struct hash_ipmac6_elem *e)
+{
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+#undef HKEY_DATALEN
+
+#define MTYPE hash_ipmac6
+#define PF 6
+#define HOST_MASK 128
+#define HKEY_DATALEN sizeof(struct hash_ipmac6_elem)
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipmac6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipmac6_elem e = {
+ { .all = { 0 } },
+ { .foo[0] = 0, .foo[1] = 0 }
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ /* MAC can be src only */
+ if (!(opt->flags & IPSET_DIM_TWO_SRC))
+ return 0;
+
+ if (skb_mac_header(skb) < skb->head ||
+ (skb_mac_header(skb) + ETH_HLEN) > skb->data)
+ return -EINVAL;
+
+ memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN);
+ if (ether_addr_equal(e.ether, invalid_ether))
+ return -EINVAL;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipmac6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipmac6_elem e = {
+ { .all = { 0 } },
+ { .foo[0] = 0, .foo[1] = 0 }
+ };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !tb[IPSET_ATTR_ETHER] ||
+ nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
+ if (ether_addr_equal(e.ether, invalid_ether))
+ return -IPSET_ERR_HASH_ELEM;
+
+ return adtfn(set, &e, &ext, &ext, flags);
+}
+
+static struct ip_set_type hash_ipmac_type __read_mostly = {
+ .name = "hash:ip,mac",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_MAC,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_ipmac_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_ETHER] = { .type = NLA_BINARY,
+ .len = ETH_ALEN },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
+ [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
+ [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ipmac_init(void)
+{
+ return ip_set_type_register(&hash_ipmac_type);
+}
+
+static void __exit
+hash_ipmac_fini(void)
+{
+ ip_set_type_unregister(&hash_ipmac_type);
+}
+
+module_init(hash_ipmac_init);
+module_exit(hash_ipmac_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
index a0695a2ab585..b64cf14e8352 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -85,7 +85,7 @@ hash_ipmark4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_ipmark *h = set->data;
+ const struct hash_ipmark4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipmark4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -101,7 +101,7 @@ static int
hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipmark *h = set->data;
+ const struct hash_ipmark4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipmark4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -193,7 +193,7 @@ nla_put_failure:
}
static inline void
-hash_ipmark6_data_next(struct hash_ipmark4_elem *next,
+hash_ipmark6_data_next(struct hash_ipmark6_elem *next,
const struct hash_ipmark6_elem *d)
{
}
@@ -211,7 +211,7 @@ hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_ipmark *h = set->data;
+ const struct hash_ipmark6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipmark6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -227,7 +227,7 @@ static int
hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipmark *h = set->data;
+ const struct hash_ipmark6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipmark6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index 9d84b3dff603..f438740e6c6a 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -108,7 +108,7 @@ static int
hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipport *h = set->data;
+ const struct hash_ipport4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipport4_elem e = { .ip = 0 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -231,7 +231,7 @@ nla_put_failure:
}
static inline void
-hash_ipport6_data_next(struct hash_ipport4_elem *next,
+hash_ipport6_data_next(struct hash_ipport6_elem *next,
const struct hash_ipport6_elem *d)
{
next->port = d->port;
@@ -266,7 +266,7 @@ static int
hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipport *h = set->data;
+ const struct hash_ipport6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipport6_elem e = { .ip = { .all = { 0 } } };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index 215b7b942038..6215fb898c50 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -111,7 +111,7 @@ static int
hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipportip *h = set->data;
+ const struct hash_ipportip4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportip4_elem e = { .ip = 0 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -241,7 +241,7 @@ nla_put_failure:
}
static inline void
-hash_ipportip6_data_next(struct hash_ipportip4_elem *next,
+hash_ipportip6_data_next(struct hash_ipportip6_elem *next,
const struct hash_ipportip6_elem *d)
{
next->port = d->port;
@@ -277,7 +277,7 @@ static int
hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipportip *h = set->data;
+ const struct hash_ipportip6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 9ca719625ea3..5ab1b99a53c2 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -138,7 +138,7 @@ hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_ipportnet *h = set->data;
+ const struct hash_ipportnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportnet4_elem e = {
.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -163,7 +163,7 @@ static int
hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipportnet *h = set->data;
+ const struct hash_ipportnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -370,7 +370,7 @@ nla_put_failure:
}
static inline void
-hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next,
+hash_ipportnet6_data_next(struct hash_ipportnet6_elem *next,
const struct hash_ipportnet6_elem *d)
{
next->port = d->port;
@@ -389,7 +389,7 @@ hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_ipportnet *h = set->data;
+ const struct hash_ipportnet6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportnet6_elem e = {
.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -414,7 +414,7 @@ static int
hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipportnet *h = set->data;
+ const struct hash_ipportnet6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 3e4bffdc1cc0..5d9e895452e7 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -117,7 +117,7 @@ hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_net *h = set->data;
+ const struct hash_net4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_net4_elem e = {
.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -139,7 +139,7 @@ static int
hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_net *h = set->data;
+ const struct hash_net4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_net4_elem e = { .cidr = HOST_MASK };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -268,7 +268,7 @@ nla_put_failure:
}
static inline void
-hash_net6_data_next(struct hash_net4_elem *next,
+hash_net6_data_next(struct hash_net6_elem *next,
const struct hash_net6_elem *d)
{
}
@@ -286,7 +286,7 @@ hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_net *h = set->data;
+ const struct hash_net6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_net6_elem e = {
.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index f0f688db6213..44cf11939c91 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -156,7 +156,7 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- struct hash_netiface *h = set->data;
+ struct hash_netiface4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netiface4_elem e = {
.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -170,7 +170,7 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
e.ip &= ip_set_netmask(e.cidr);
-#define IFACE(dir) (par->dir ? par->dir->name : "")
+#define IFACE(dir) (par->state->dir ? par->state->dir->name : "")
#define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC)
if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
@@ -196,7 +196,7 @@ static int
hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- struct hash_netiface *h = set->data;
+ struct hash_netiface4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -348,7 +348,7 @@ nla_put_failure:
}
static inline void
-hash_netiface6_data_next(struct hash_netiface4_elem *next,
+hash_netiface6_data_next(struct hash_netiface6_elem *next,
const struct hash_netiface6_elem *d)
{
}
@@ -367,7 +367,7 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- struct hash_netiface *h = set->data;
+ struct hash_netiface6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netiface6_elem e = {
.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index a93dfebffa81..db614e13b193 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -143,7 +143,7 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_netnet *h = set->data;
+ const struct hash_netnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -165,7 +165,7 @@ static int
hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_netnet *h = set->data;
+ const struct hash_netnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -352,7 +352,7 @@ nla_put_failure:
}
static inline void
-hash_netnet6_data_next(struct hash_netnet4_elem *next,
+hash_netnet6_data_next(struct hash_netnet6_elem *next,
const struct hash_netnet6_elem *d)
{
}
@@ -377,7 +377,7 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_netnet *h = set->data;
+ const struct hash_netnet6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 731813e0f08c..54b64b6cd0cd 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -133,7 +133,7 @@ hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_netport *h = set->data;
+ const struct hash_netport4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netport4_elem e = {
.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -157,7 +157,7 @@ static int
hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_netport *h = set->data;
+ const struct hash_netport4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -329,7 +329,7 @@ nla_put_failure:
}
static inline void
-hash_netport6_data_next(struct hash_netport4_elem *next,
+hash_netport6_data_next(struct hash_netport6_elem *next,
const struct hash_netport6_elem *d)
{
next->port = d->port;
@@ -348,7 +348,7 @@ hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_netport *h = set->data;
+ const struct hash_netport6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netport6_elem e = {
.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -372,7 +372,7 @@ static int
hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_netport *h = set->data;
+ const struct hash_netport6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netport6_elem e = { .cidr = HOST_MASK - 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index 9a14c237830f..aff846960ac4 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -154,7 +154,7 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_netportnet *h = set->data;
+ const struct hash_netportnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netportnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -180,7 +180,7 @@ static int
hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_netportnet *h = set->data;
+ const struct hash_netportnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netportnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -406,7 +406,7 @@ nla_put_failure:
}
static inline void
-hash_netportnet6_data_next(struct hash_netportnet4_elem *next,
+hash_netportnet6_data_next(struct hash_netportnet6_elem *next,
const struct hash_netportnet6_elem *d)
{
next->port = d->port;
@@ -432,7 +432,7 @@ hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct hash_netportnet *h = set->data;
+ const struct hash_netportnet6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netportnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -458,7 +458,7 @@ static int
hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_netportnet *h = set->data;
+ const struct hash_netportnet6 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netportnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index a2a89e4e0a14..178d4eba013b 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -166,6 +166,7 @@ __list_set_del_rcu(struct rcu_head * rcu)
static inline void
list_set_del(struct ip_set *set, struct set_elem *e)
{
+ set->elements--;
list_del_rcu(&e->list);
call_rcu(&e->rcu, __list_set_del_rcu);
}
@@ -227,7 +228,7 @@ list_set_init_extensions(struct ip_set *set, const struct ip_set_ext *ext,
if (SET_WITH_COUNTER(set))
ip_set_init_counter(ext_counter(e, set), ext);
if (SET_WITH_COMMENT(set))
- ip_set_init_comment(ext_comment(e, set), ext);
+ ip_set_init_comment(set, ext_comment(e, set), ext);
if (SET_WITH_SKBINFO(set))
ip_set_init_skbinfo(ext_skbinfo(e, set), ext);
/* Update timeout last */
@@ -259,11 +260,14 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
else
prev = e;
}
+
+ /* If before/after is used on an empty set */
+ if ((d->before > 0 && !next) ||
+ (d->before < 0 && !prev))
+ return -IPSET_ERR_REF_EXIST;
+
/* Re-add already existing element */
if (n) {
- if ((d->before > 0 && !next) ||
- (d->before < 0 && !prev))
- return -IPSET_ERR_REF_EXIST;
if (!flag_exist)
return -IPSET_ERR_EXIST;
/* Update extensions */
@@ -309,6 +313,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
list_add_rcu(&e->list, &prev->list);
else
list_add_tail_rcu(&e->list, &map->members);
+ set->elements++;
return 0;
}
@@ -419,6 +424,8 @@ list_set_flush(struct ip_set *set)
list_for_each_entry_safe(e, n, &map->members, list)
list_set_del(set, e);
+ set->elements = 0;
+ set->ext_size = 0;
}
static void
@@ -441,12 +448,12 @@ list_set_destroy(struct ip_set *set)
set->data = NULL;
}
-static int
-list_set_head(struct ip_set *set, struct sk_buff *skb)
+/* Calculate the actual memory size of the set data */
+static size_t
+list_set_memsize(const struct list_set *map, size_t dsize)
{
- const struct list_set *map = set->data;
- struct nlattr *nested;
struct set_elem *e;
+ size_t memsize;
u32 n = 0;
rcu_read_lock();
@@ -454,13 +461,25 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
n++;
rcu_read_unlock();
+ memsize = sizeof(*map) + n * dsize;
+
+ return memsize;
+}
+
+static int
+list_set_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct list_set *map = set->data;
+ struct nlattr *nested;
+ size_t memsize = list_set_memsize(map, set->dsize) + set->ext_size;
+
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
goto nla_put_failure;
if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) ||
nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
- nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
- htonl(sizeof(*map) + n * set->dsize)))
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
+ nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
goto nla_put_failure;
@@ -570,11 +589,8 @@ list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
{
struct list_set *map = set->data;
- init_timer(&map->gc);
- map->gc.data = (unsigned long)set;
- map->gc.function = gc;
- map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
- add_timer(&map->gc);
+ setup_timer(&map->gc, gc, (unsigned long)set);
+ mod_timer(&map->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
}
/* Create list:set type of sets */
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 096a45103f14..e6a2753dff9e 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1429,7 +1429,7 @@ int __init ip_vs_conn_init(void)
"(size=%d, memory=%ldKbytes)\n",
ip_vs_conn_tab_size,
(long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
- IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
+ IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n",
sizeof(struct ip_vs_conn));
for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 2c1b498a7a27..db40050f8785 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -70,7 +70,7 @@ EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
EXPORT_SYMBOL(ip_vs_new_conn_out);
-static int ip_vs_net_id __read_mostly;
+static unsigned int ip_vs_net_id __read_mostly;
/* netns cnt used for uniqueness */
static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index a6e44ef2ec9a..5aeb0dde6ccc 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -48,7 +48,7 @@
#include <net/sock.h>
#include <net/genetlink.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/ip_vs.h>
@@ -426,10 +426,9 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol
*/
svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);
- if (svc == NULL
- && protocol == IPPROTO_TCP
- && atomic_read(&ipvs->ftpsvc_counter)
- && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
+ if (!svc && protocol == IPPROTO_TCP &&
+ atomic_read(&ipvs->ftpsvc_counter) &&
+ (vport == FTPDATA || ntohs(vport) >= inet_prot_sock(ipvs->net))) {
/*
* Check if ftp service entry exists, the packet
* might belong to FTP data connections.
@@ -711,7 +710,6 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
dest->vport == svc->port))) {
/* HIT */
list_del(&dest->t_list);
- ip_vs_dest_hold(dest);
goto out;
}
}
@@ -741,7 +739,7 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest)
* When the ip_vs_control_clearup is activated by ipvs module exit,
* the service tables must have been flushed and all the connections
* are expired, and the refcnt of each destination in the trash must
- * be 0, so we simply release them here.
+ * be 1, so we simply release them here.
*/
static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
{
@@ -1080,11 +1078,10 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
if (list_empty(&ipvs->dest_trash) && !cleanup)
mod_timer(&ipvs->dest_trash_timer,
jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
- /* dest lives in trash without reference */
+ /* dest lives in trash with reference */
list_add(&dest->t_list, &ipvs->dest_trash);
dest->idle_start = 0;
spin_unlock_bh(&ipvs->dest_trash_lock);
- ip_vs_dest_put(dest);
}
@@ -1160,7 +1157,7 @@ static void ip_vs_dest_trash_expire(unsigned long data)
spin_lock(&ipvs->dest_trash_lock);
list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
- if (atomic_read(&dest->refcnt) > 0)
+ if (atomic_read(&dest->refcnt) > 1)
continue;
if (dest->idle_start) {
if (time_before(now, dest->idle_start +
@@ -2840,14 +2837,7 @@ static struct nf_sockopt_ops ip_vs_sockopts = {
*/
/* IPVS genetlink family */
-static struct genl_family ip_vs_genl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = IPVS_GENL_NAME,
- .version = IPVS_GENL_VERSION,
- .maxattr = IPVS_CMD_ATTR_MAX,
- .netnsok = true, /* Make ipvsadm to work on netns */
-};
+static struct genl_family ip_vs_genl_family;
/* Policy used for first-level command attributes */
static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
@@ -3267,7 +3257,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]);
- if (IS_ERR(svc) || svc == NULL)
+ if (IS_ERR_OR_NULL(svc))
goto out_err;
/* Dump the destinations */
@@ -3872,10 +3862,20 @@ static const struct genl_ops ip_vs_genl_ops[] = {
},
};
+static struct genl_family ip_vs_genl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = IPVS_GENL_NAME,
+ .version = IPVS_GENL_VERSION,
+ .maxattr = IPVS_CMD_ATTR_MAX,
+ .netnsok = true, /* Make ipvsadm to work on netns */
+ .module = THIS_MODULE,
+ .ops = ip_vs_genl_ops,
+ .n_ops = ARRAY_SIZE(ip_vs_genl_ops),
+};
+
static int __init ip_vs_genl_register(void)
{
- return genl_register_family_with_ops(&ip_vs_genl_family,
- ip_vs_genl_ops);
+ return genl_register_family(&ip_vs_genl_family);
}
static void ip_vs_genl_unregister(void)
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 6be5c538b71e..75f798f8e83b 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -163,7 +163,7 @@ static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
return -ENOMEM;
svc->sched_data = s;
- IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
+ IP_VS_DBG(6, "DH hash table (memory=%zdbytes) allocated for "
"current service\n",
sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
@@ -183,7 +183,7 @@ static void ip_vs_dh_done_svc(struct ip_vs_service *svc)
/* release the table itself */
kfree_rcu(s, rcu_head);
- IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
+ IP_VS_DBG(6, "DH hash table (memory=%zdbytes) released\n",
sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
}
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index cccf4d637412..5824927cf8e0 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -356,7 +356,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
return -ENOMEM;
svc->sched_data = tbl;
- IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
+ IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) allocated for "
"current service\n", sizeof(*tbl));
/*
@@ -393,7 +393,7 @@ static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
/* release the table itself */
kfree_rcu(tbl, rcu_head);
- IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
+ IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) released\n",
sizeof(*tbl));
}
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 796d70e47ddd..703f11877bee 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -519,7 +519,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
return -ENOMEM;
svc->sched_data = tbl;
- IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
+ IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) allocated for "
"current service\n", sizeof(*tbl));
/*
@@ -556,7 +556,7 @@ static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
/* release the table itself */
kfree_rcu(tbl, rcu_head);
- IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
+ IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) released\n",
sizeof(*tbl));
}
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 1e373a5e44e3..16aaac6eedc9 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -239,7 +239,7 @@ static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
return -ENOMEM;
svc->sched_data = s;
- IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
+ IP_VS_DBG(6, "SH hash table (memory=%zdbytes) allocated for "
"current service\n",
sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
@@ -259,7 +259,7 @@ static void ip_vs_sh_done_svc(struct ip_vs_service *svc)
/* release the table itself */
kfree_rcu(s, rcu_head);
- IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
+ IP_VS_DBG(6, "SH hash table (memory=%zdbytes) released\n",
sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
}
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 9350530c16c1..b03c28084f81 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1791,7 +1791,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
u16 mtu, min_mtu;
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
- IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
+ IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n",
sizeof(struct ip_vs_sync_conn_v0));
if (!ipvs->sync_state) {
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 01d3d894de46..4e1a98fcc8c3 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -254,6 +254,54 @@ static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
return true;
}
+static inline bool decrement_ttl(struct netns_ipvs *ipvs,
+ int skb_af,
+ struct sk_buff *skb)
+{
+ struct net *net = ipvs->net;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (skb_af == AF_INET6) {
+ struct dst_entry *dst = skb_dst(skb);
+
+ /* check and decrement ttl */
+ if (ipv6_hdr(skb)->hop_limit <= 1) {
+ /* Force OUTPUT device used as source address */
+ skb->dev = dst->dev;
+ icmpv6_send(skb, ICMPV6_TIME_EXCEED,
+ ICMPV6_EXC_HOPLIMIT, 0);
+ __IP6_INC_STATS(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_INHDRERRORS);
+
+ return false;
+ }
+
+ /* don't propagate ttl change to cloned packets */
+ if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+ return false;
+
+ ipv6_hdr(skb)->hop_limit--;
+ } else
+#endif
+ {
+ if (ip_hdr(skb)->ttl <= 1) {
+ /* Tell the sender its packet died... */
+ __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
+ icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
+ return false;
+ }
+
+ /* don't propagate ttl change to cloned packets */
+ if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ return false;
+
+ /* Decrease ttl */
+ ip_decrease_ttl(ip_hdr(skb));
+ }
+
+ return true;
+}
+
/* Get route to destination or remote server */
static int
__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
@@ -326,6 +374,9 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
return local;
}
+ if (!decrement_ttl(ipvs, skb_af, skb))
+ goto err_put;
+
if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
mtu = dst_mtu(&rt->dst);
} else {
@@ -473,6 +524,9 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
return local;
}
+ if (!decrement_ttl(ipvs, skb_af, skb))
+ goto err_put;
+
/* MTU checking */
if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
mtu = dst_mtu(&rt->dst);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0f87e5d21be7..ffb78e5f7b70 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -85,11 +85,11 @@ static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;
/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */
-#define GC_MAX_BUCKETS_DIV 64u
-/* upper bound of scan intervals */
-#define GC_INTERVAL_MAX (2 * HZ)
-/* maximum conntracks to evict per gc run */
-#define GC_MAX_EVICTS 256u
+#define GC_MAX_BUCKETS_DIV 128u
+/* upper bound of full table scan */
+#define GC_MAX_SCAN_JIFFIES (16u * HZ)
+/* desired ratio of entries found to be expired */
+#define GC_EVICT_RATIO 50u
static struct conntrack_gc_work conntrack_gc_work;
@@ -181,7 +181,11 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
unsigned int nf_conntrack_max __read_mostly;
seqcount_t nf_conntrack_generation __read_mostly;
-DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
+/* nf_conn must be 8 bytes aligned, as the 3 LSB bits are used
+ * for the nfctinfo. We cheat by (ab)using the PER CPU cache line
+ * alignment to enforce this.
+ */
+DEFINE_PER_CPU_ALIGNED(struct nf_conn, nf_conntrack_untracked);
EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
static unsigned int nf_conntrack_hash_rnd __read_mostly;
@@ -350,16 +354,31 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
spin_unlock(&pcpu->lock);
}
+#define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)
+
/* Released via destroy_conntrack() */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
const struct nf_conntrack_zone *zone,
gfp_t flags)
{
- struct nf_conn *tmpl;
+ struct nf_conn *tmpl, *p;
- tmpl = kzalloc(sizeof(*tmpl), flags);
- if (tmpl == NULL)
- return NULL;
+ if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
+ tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
+ if (!tmpl)
+ return NULL;
+
+ p = tmpl;
+ tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
+ if (tmpl != p) {
+ tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
+ tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
+ }
+ } else {
+ tmpl = kzalloc(sizeof(*tmpl), flags);
+ if (!tmpl)
+ return NULL;
+ }
tmpl->status = IPS_TEMPLATE;
write_pnet(&tmpl->ct_net, net);
@@ -374,7 +393,11 @@ void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
nf_ct_ext_destroy(tmpl);
nf_ct_ext_free(tmpl);
- kfree(tmpl);
+
+ if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
+ kfree((char *)tmpl - tmpl->proto.tmpl_padto);
+ else
+ kfree(tmpl);
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
@@ -686,12 +709,12 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
!nfct_nat(ct) &&
!nf_ct_is_dying(ct) &&
atomic_inc_not_zero(&ct->ct_general.use)) {
- nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct);
- nf_conntrack_put(skb->nfct);
- /* Assign conntrack already in hashes to this skbuff. Don't
- * modify skb->nfctinfo to ensure consistent stateful filtering.
- */
- skb->nfct = &ct->ct_general;
+ enum ip_conntrack_info oldinfo;
+ struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
+
+ nf_ct_acct_merge(ct, ctinfo, loser_ct);
+ nf_conntrack_put(&loser_ct->ct_general);
+ nf_ct_set(skb, ct, oldinfo);
return NF_ACCEPT;
}
NF_CT_STAT_INC(net, drop);
@@ -783,7 +806,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
/* set conntrack timestamp, if enabled. */
tstamp = nf_conn_tstamp_find(ct);
if (tstamp) {
- if (skb->tstamp.tv64 == 0)
+ if (skb->tstamp == 0)
__net_timestamp(skb);
tstamp->start = ktime_to_ns(skb->tstamp);
@@ -938,6 +961,7 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
static void gc_worker(struct work_struct *work)
{
+ unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
unsigned int i, goal, buckets = 0, expired_count = 0;
struct conntrack_gc_work *gc_work;
unsigned int ratio, scanned = 0;
@@ -979,8 +1003,7 @@ static void gc_worker(struct work_struct *work)
*/
rcu_read_unlock();
cond_resched_rcu_qs();
- } while (++buckets < goal &&
- expired_count < GC_MAX_EVICTS);
+ } while (++buckets < goal);
if (gc_work->exiting)
return;
@@ -997,27 +1020,25 @@ static void gc_worker(struct work_struct *work)
* 1. Minimize time until we notice a stale entry
* 2. Maximize scan intervals to not waste cycles
*
- * Normally, expired_count will be 0, this increases the next_run time
- * to priorize 2) above.
+ * Normally, expire ratio will be close to 0.
*
- * As soon as a timed-out entry is found, move towards 1) and increase
- * the scan frequency.
- * In case we have lots of evictions next scan is done immediately.
+ * As soon as a sizeable fraction of the entries have expired
+ * increase scan frequency.
*/
ratio = scanned ? expired_count * 100 / scanned : 0;
- if (ratio >= 90 || expired_count == GC_MAX_EVICTS) {
- gc_work->next_gc_run = 0;
- next_run = 0;
- } else if (expired_count) {
- gc_work->next_gc_run /= 2U;
- next_run = msecs_to_jiffies(1);
+ if (ratio > GC_EVICT_RATIO) {
+ gc_work->next_gc_run = min_interval;
} else {
- if (gc_work->next_gc_run < GC_INTERVAL_MAX)
- gc_work->next_gc_run += msecs_to_jiffies(1);
+ unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV;
- next_run = gc_work->next_gc_run;
+ BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0);
+
+ gc_work->next_gc_run += min_interval;
+ if (gc_work->next_gc_run > max)
+ gc_work->next_gc_run = max;
}
+ next_run = gc_work->next_gc_run;
gc_work->last_bucket = i;
queue_delayed_work(system_long_wq, &gc_work->dwork, next_run);
}
@@ -1025,7 +1046,7 @@ static void gc_worker(struct work_struct *work)
static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
- gc_work->next_gc_run = GC_INTERVAL_MAX;
+ gc_work->next_gc_run = HZ;
gc_work->exiting = false;
}
@@ -1220,7 +1241,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}
-/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
+/* On success, returns conntrack ptr, sets skb->_nfct | ctinfo */
static inline struct nf_conn *
resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
struct sk_buff *skb,
@@ -1279,8 +1300,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
}
*set_reply = 0;
}
- skb->nfct = &ct->ct_general;
- skb->nfctinfo = *ctinfo;
+ nf_ct_set(skb, ct, *ctinfo);
return ct;
}
@@ -1288,7 +1308,7 @@ unsigned int
nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
struct sk_buff *skb)
{
- struct nf_conn *ct, *tmpl = NULL;
+ struct nf_conn *ct, *tmpl;
enum ip_conntrack_info ctinfo;
struct nf_conntrack_l3proto *l3proto;
struct nf_conntrack_l4proto *l4proto;
@@ -1298,14 +1318,14 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
int set_reply = 0;
int ret;
- if (skb->nfct) {
+ tmpl = nf_ct_get(skb, &ctinfo);
+ if (tmpl) {
/* Previously seen (loopback or untracked)? Ignore. */
- tmpl = (struct nf_conn *)skb->nfct;
if (!nf_ct_is_template(tmpl)) {
NF_CT_STAT_INC_ATOMIC(net, ignore);
return NF_ACCEPT;
}
- skb->nfct = NULL;
+ skb->_nfct = 0;
}
/* rcu_read_lock()ed by nf_hook_thresh */
@@ -1326,8 +1346,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
* inverse of the return code tells to the netfilter
* core what to do with the packet. */
if (l4proto->error != NULL) {
- ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
- pf, hooknum);
+ ret = l4proto->error(net, tmpl, skb, dataoff, pf, hooknum);
if (ret <= 0) {
NF_CT_STAT_INC_ATOMIC(net, error);
NF_CT_STAT_INC_ATOMIC(net, invalid);
@@ -1335,10 +1354,10 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
goto out;
}
/* ICMP[v6] protocol trackers may assign one conntrack. */
- if (skb->nfct)
+ if (skb->_nfct)
goto out;
}
-
+repeat:
ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
l3proto, l4proto, &set_reply, &ctinfo);
if (!ct) {
@@ -1355,7 +1374,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
goto out;
}
- NF_CT_ASSERT(skb->nfct);
+ NF_CT_ASSERT(skb_nfct(skb));
/* Decide what timeout policy we want to apply to this flow. */
timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
@@ -1365,11 +1384,17 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
/* Invalid: inverse of the return code tells
* the netfilter core what to do */
pr_debug("nf_conntrack_in: Can't track with proto module\n");
- nf_conntrack_put(skb->nfct);
- skb->nfct = NULL;
+ nf_conntrack_put(&ct->ct_general);
+ skb->_nfct = 0;
NF_CT_STAT_INC_ATOMIC(net, invalid);
if (ret == -NF_DROP)
NF_CT_STAT_INC_ATOMIC(net, drop);
+ /* Special case: TCP tracker reports an attempt to reopen a
+ * closed/aborted connection. We have to go back and create a
+ * fresh conntrack.
+ */
+ if (ret == -NF_REPEAT)
+ goto repeat;
ret = -ret;
goto out;
}
@@ -1377,15 +1402,8 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
- if (tmpl) {
- /* Special case: we have to repeat this hook, assign the
- * template again to this packet. We assume that this packet
- * has no conntrack assigned. This is used by nf_ct_tcp. */
- if (ret == NF_REPEAT)
- skb->nfct = (struct nf_conntrack *)tmpl;
- else
- nf_ct_put(tmpl);
- }
+ if (tmpl)
+ nf_ct_put(tmpl);
return ret;
}
@@ -1525,9 +1543,8 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
ctinfo = IP_CT_RELATED;
/* Attach to new skbuff, and increment count */
- nskb->nfct = &ct->ct_general;
- nskb->nfctinfo = ctinfo;
- nf_conntrack_get(nskb->nfct);
+ nf_ct_set(nskb, ct, ctinfo);
+ nf_conntrack_get(skb_nfct(nskb));
}
/* Bring out ya dead! */
@@ -1863,7 +1880,8 @@ int nf_conntrack_init_start(void)
nf_conntrack_max = max_factor * nf_conntrack_htable_size;
nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
- sizeof(struct nf_conn), 0,
+ sizeof(struct nf_conn),
+ NFCT_INFOMASK + 1,
SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
if (!nf_conntrack_cachep)
goto err_cachep;
@@ -1918,7 +1936,7 @@ int nf_conntrack_init_start(void)
nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
conntrack_gc_work_init(&conntrack_gc_work);
- queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, GC_INTERVAL_MAX);
+ queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, HZ);
return 0;
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index da9df2d56e66..22fc32143e9c 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -290,6 +290,7 @@ void nf_conntrack_unregister_notifier(struct net *net,
BUG_ON(notify != new);
RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
mutex_unlock(&nf_ct_ecache_mutex);
+ /* synchronize_rcu() is called from ctnetlink_exit. */
}
EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
@@ -326,6 +327,7 @@ void nf_ct_expect_unregister_notifier(struct net *net,
BUG_ON(notify != new);
RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL);
mutex_unlock(&nf_ct_ecache_mutex);
+ /* synchronize_rcu() is called from ctnetlink_exit. */
}
EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index f8dbacf66795..d80073037856 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -57,7 +57,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
hlist_del_rcu(&exp->hnode);
net->ct.expect_count--;
- hlist_del(&exp->lnode);
+ hlist_del_rcu(&exp->lnode);
master_help->expecting[exp->class]--;
nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report);
@@ -353,7 +353,7 @@ void nf_ct_expect_put(struct nf_conntrack_expect *exp)
}
EXPORT_SYMBOL_GPL(nf_ct_expect_put);
-static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
+static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
{
struct nf_conn_help *master_help = nfct_help(exp->master);
struct nf_conntrack_helper *helper;
@@ -363,7 +363,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
/* two references : one for hash insert, one for the timer */
atomic_add(2, &exp->use);
- hlist_add_head(&exp->lnode, &master_help->expectations);
+ hlist_add_head_rcu(&exp->lnode, &master_help->expectations);
master_help->expecting[exp->class]++;
hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]);
@@ -380,7 +380,6 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
add_timer(&exp->timeout);
NF_CT_STAT_INC(net, expect_create);
- return 0;
}
/* Race with expectations being used means we could have none to find; OK. */
@@ -411,7 +410,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
struct net *net = nf_ct_exp_net(expect);
struct hlist_node *next;
unsigned int h;
- int ret = 1;
+ int ret = 0;
if (!master_help) {
ret = -ESHUTDOWN;
@@ -461,15 +460,14 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
spin_lock_bh(&nf_conntrack_expect_lock);
ret = __nf_ct_expect_check(expect);
- if (ret <= 0)
- goto out;
-
- ret = nf_ct_expect_insert(expect);
if (ret < 0)
goto out;
+
+ nf_ct_expect_insert(expect);
+
spin_unlock_bh(&nf_conntrack_expect_lock);
nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
- return ret;
+ return 0;
out:
spin_unlock_bh(&nf_conntrack_expect_lock);
return ret;
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 02bcf00c2492..008299b7f78f 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -53,7 +53,11 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id,
rcu_read_lock();
t = rcu_dereference(nf_ct_ext_types[id]);
- BUG_ON(t == NULL);
+ if (!t) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
off = ALIGN(sizeof(struct nf_ct_ext), t->align);
len = off + t->len + var_alloc_len;
alloc_size = t->alloc_size + var_alloc_len;
@@ -88,7 +92,10 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id,
rcu_read_lock();
t = rcu_dereference(nf_ct_ext_types[id]);
- BUG_ON(t == NULL);
+ if (!t) {
+ rcu_read_unlock();
+ return NULL;
+ }
newoff = ALIGN(old->len, t->align);
newlen = newoff + t->len + var_alloc_len;
@@ -175,6 +182,6 @@ void nf_ct_extend_unregister(struct nf_ct_ext_type *type)
RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL);
update_alloc_size(type);
mutex_unlock(&nf_ct_ext_type_mutex);
- rcu_barrier(); /* Wait for completion of call_rcu()'s */
+ synchronize_rcu();
}
EXPORT_SYMBOL_GPL(nf_ct_extend_unregister);
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index e3ed20060878..4aecef4a89fb 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -300,7 +300,7 @@ static int find_pattern(const char *data, size_t dlen,
{
size_t i = plen;
- pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen);
+ pr_debug("find_pattern `%s': dlen = %zu\n", pattern, dlen);
if (dlen <= plen) {
/* Short packet: try for partial? */
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 7341adf7059d..4eeb3418366a 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -158,16 +158,25 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum)
{
struct nf_conntrack_helper *h;
+ rcu_read_lock();
+
h = __nf_conntrack_helper_find(name, l3num, protonum);
#ifdef CONFIG_MODULES
if (h == NULL) {
- if (request_module("nfct-helper-%s", name) == 0)
+ rcu_read_unlock();
+ if (request_module("nfct-helper-%s", name) == 0) {
+ rcu_read_lock();
h = __nf_conntrack_helper_find(name, l3num, protonum);
+ } else {
+ return h;
+ }
}
#endif
if (h != NULL && !try_module_get(h->me))
h = NULL;
+ rcu_read_unlock();
+
return h;
}
EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get);
@@ -188,6 +197,26 @@ nf_ct_helper_ext_add(struct nf_conn *ct,
}
EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add);
+static struct nf_conntrack_helper *
+nf_ct_lookup_helper(struct nf_conn *ct, struct net *net)
+{
+ if (!net->ct.sysctl_auto_assign_helper) {
+ if (net->ct.auto_assign_helper_warned)
+ return NULL;
+ if (!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple))
+ return NULL;
+ pr_info("nf_conntrack: default automatic helper assignment "
+ "has been turned off for security reasons and CT-based "
+ " firewall rule not found. Use the iptables CT target "
+ "to attach helpers instead.\n");
+ net->ct.auto_assign_helper_warned = 1;
+ return NULL;
+ }
+
+ return __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+}
+
+
int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
gfp_t flags)
{
@@ -213,21 +242,14 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
}
help = nfct_help(ct);
- if (net->ct.sysctl_auto_assign_helper && helper == NULL) {
- helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
- if (unlikely(!net->ct.auto_assign_helper_warned && helper)) {
- pr_info("nf_conntrack: automatic helper "
- "assignment is deprecated and it will "
- "be removed soon. Use the iptables CT target "
- "to attach helpers instead.\n");
- net->ct.auto_assign_helper_warned = true;
- }
- }
if (helper == NULL) {
- if (help)
- RCU_INIT_POINTER(help->helper, NULL);
- return 0;
+ helper = nf_ct_lookup_helper(ct, net);
+ if (helper == NULL) {
+ if (help)
+ RCU_INIT_POINTER(help->helper, NULL);
+ return 0;
+ }
}
if (help == NULL) {
@@ -298,38 +320,36 @@ void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n)
}
EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister);
+/* Caller should hold the rcu lock */
struct nf_ct_helper_expectfn *
nf_ct_helper_expectfn_find_by_name(const char *name)
{
struct nf_ct_helper_expectfn *cur;
bool found = false;
- rcu_read_lock();
list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) {
if (!strcmp(cur->name, name)) {
found = true;
break;
}
}
- rcu_read_unlock();
return found ? cur : NULL;
}
EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_name);
+/* Caller should hold the rcu lock */
struct nf_ct_helper_expectfn *
nf_ct_helper_expectfn_find_by_symbol(const void *symbol)
{
struct nf_ct_helper_expectfn *cur;
bool found = false;
- rcu_read_lock();
list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) {
if (cur->expectfn == symbol) {
found = true;
break;
}
}
- rcu_read_unlock();
return found ? cur : NULL;
}
EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 27540455dc62..dc7dfd68fafe 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1478,14 +1478,28 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
struct nlattr *helpinfo = NULL;
int err;
- /* don't change helper of sibling connections */
- if (ct->master)
- return -EBUSY;
-
err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);
if (err < 0)
return err;
+ /* don't change helper of sibling connections */
+ if (ct->master) {
+ /* If we try to change the helper to the same thing twice,
+ * treat the second attempt as a no-op instead of returning
+ * an error.
+ */
+ err = -EBUSY;
+ if (help) {
+ rcu_read_lock();
+ helper = rcu_dereference(help->helper);
+ if (helper && !strcmp(helper->name, helpname))
+ err = 0;
+ rcu_read_unlock();
+ }
+
+ return err;
+ }
+
if (!strcmp(helpname, "")) {
if (help && help->helper) {
/* we had a helper before ... */
@@ -1920,9 +1934,9 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
err = 0;
if (test_bit(IPS_EXPECTED_BIT, &ct->status))
- events = IPCT_RELATED;
+ events = 1 << IPCT_RELATED;
else
- events = IPCT_NEW;
+ events = 1 << IPCT_NEW;
if (cda[CTA_LABELS] &&
ctnetlink_attach_labels(ct, cda) == 0)
@@ -2270,6 +2284,30 @@ nla_put_failure:
}
static int
+ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+ unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS]));
+ unsigned long d = ct->status ^ status;
+
+ if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
+ /* SEEN_REPLY bit can only be set */
+ return -EBUSY;
+
+ if (d & IPS_ASSURED && !(status & IPS_ASSURED))
+ /* ASSURED bit can only be set */
+ return -EBUSY;
+
+ /* This check is less strict than ctnetlink_change_status()
+ * because callers often flip IPS_EXPECTED bits when sending
+ * an NFQA_CT attribute to the kernel. So ignore the
+ * unchangeable bits but do not error out.
+ */
+ ct->status = (status & ~IPS_UNCHANGEABLE_MASK) |
+ (ct->status & IPS_UNCHANGEABLE_MASK);
+ return 0;
+}
+
+static int
ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
{
int err;
@@ -2280,7 +2318,7 @@ ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
return err;
}
if (cda[CTA_STATUS]) {
- err = ctnetlink_change_status(ct, cda);
+ err = ctnetlink_update_status(ct, cda);
if (err < 0)
return err;
}
@@ -2642,8 +2680,8 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
last = (struct nf_conntrack_expect *)cb->args[1];
for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
restart:
- hlist_for_each_entry(exp, &nf_ct_expect_hash[cb->args[0]],
- hnode) {
+ hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]],
+ hnode) {
if (l3proto && exp->tuple.src.l3num != l3proto)
continue;
@@ -2694,7 +2732,7 @@ ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock();
last = (struct nf_conntrack_expect *)cb->args[1];
restart:
- hlist_for_each_entry(exp, &help->expectations, lnode) {
+ hlist_for_each_entry_rcu(exp, &help->expectations, lnode) {
if (l3proto && exp->tuple.src.l3num != l3proto)
continue;
if (cb->args[1]) {
@@ -2756,6 +2794,12 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl,
return -ENOENT;
ct = nf_ct_tuplehash_to_ctrack(h);
+ /* No expectation linked to this connection tracking. */
+ if (!nfct_help(ct)) {
+ nf_ct_put(ct);
+ return 0;
+ }
+
c.data = ct;
err = netlink_dump_start(ctnl, skb, nlh, &c);
@@ -3100,23 +3144,27 @@ ctnetlink_create_expect(struct net *net,
return -ENOENT;
ct = nf_ct_tuplehash_to_ctrack(h);
+ rcu_read_lock();
if (cda[CTA_EXPECT_HELP_NAME]) {
const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);
helper = __nf_conntrack_helper_find(helpname, u3,
nf_ct_protonum(ct));
if (helper == NULL) {
+ rcu_read_unlock();
#ifdef CONFIG_MODULES
if (request_module("nfct-helper-%s", helpname) < 0) {
err = -EOPNOTSUPP;
goto err_ct;
}
+ rcu_read_lock();
helper = __nf_conntrack_helper_find(helpname, u3,
nf_ct_protonum(ct));
if (helper) {
err = -EAGAIN;
- goto err_ct;
+ goto err_rcu;
}
+ rcu_read_unlock();
#endif
err = -EOPNOTSUPP;
goto err_ct;
@@ -3126,11 +3174,13 @@ ctnetlink_create_expect(struct net *net,
exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask);
if (IS_ERR(exp)) {
err = PTR_ERR(exp);
- goto err_ct;
+ goto err_rcu;
}
err = nf_ct_expect_related_report(exp, portid, report);
nf_ct_expect_put(exp);
+err_rcu:
+ rcu_read_unlock();
err_ct:
nf_ct_put(ct);
return err;
@@ -3409,6 +3459,7 @@ static void __exit ctnetlink_exit(void)
#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT
RCU_INIT_POINTER(nfnl_ct_hook, NULL);
#endif
+ synchronize_rcu();
}
module_init(ctnetlink_init);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 8d2c7d8c666a..2d6ee1803415 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -125,6 +125,54 @@ void nf_ct_l3proto_module_put(unsigned short l3proto)
}
EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put);
+int nf_ct_netns_get(struct net *net, u8 nfproto)
+{
+ const struct nf_conntrack_l3proto *l3proto;
+ int ret;
+
+ might_sleep();
+
+ ret = nf_ct_l3proto_try_module_get(nfproto);
+ if (ret < 0)
+ return ret;
+
+ /* we already have a reference, can't fail */
+ rcu_read_lock();
+ l3proto = __nf_ct_l3proto_find(nfproto);
+ rcu_read_unlock();
+
+ if (!l3proto->net_ns_get)
+ return 0;
+
+ ret = l3proto->net_ns_get(net);
+ if (ret < 0)
+ nf_ct_l3proto_module_put(nfproto);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_netns_get);
+
+void nf_ct_netns_put(struct net *net, u8 nfproto)
+{
+ const struct nf_conntrack_l3proto *l3proto;
+
+ might_sleep();
+
+ /* same as nf_conntrack_netns_get(), reference assumed */
+ rcu_read_lock();
+ l3proto = __nf_ct_l3proto_find(nfproto);
+ rcu_read_unlock();
+
+ if (WARN_ON(!l3proto))
+ return;
+
+ if (l3proto->net_ns_put)
+ l3proto->net_ns_put(net);
+
+ nf_ct_l3proto_module_put(nfproto);
+}
+EXPORT_SYMBOL_GPL(nf_ct_netns_put);
+
struct nf_conntrack_l4proto *
nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num)
{
@@ -190,20 +238,19 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(nf_ct_l3proto_register);
+#ifdef CONFIG_SYSCTL
+extern unsigned int nf_conntrack_default_on;
+
int nf_ct_l3proto_pernet_register(struct net *net,
struct nf_conntrack_l3proto *proto)
{
- int ret;
-
- if (proto->init_net) {
- ret = proto->init_net(net);
- if (ret < 0)
- return ret;
- }
+ if (nf_conntrack_default_on == 0)
+ return 0;
- return 0;
+ return proto->net_ns_get ? proto->net_ns_get(net) : 0;
}
EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register);
+#endif
void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto)
{
@@ -224,6 +271,16 @@ EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister);
void nf_ct_l3proto_pernet_unregister(struct net *net,
struct nf_conntrack_l3proto *proto)
{
+ /*
+ * nf_conntrack_default_on *might* have registered hooks.
+ * ->net_ns_put must cope with more puts() than get(), i.e.
+ * if nf_conntrack_default_on was 0 at time of
+ * nf_ct_l3proto_pernet_register invocation this net_ns_put()
+ * should be a noop.
+ */
+ if (proto->net_ns_put)
+ proto->net_ns_put(net);
+
/* Remove all contrack entries for this protocol */
nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0);
}
@@ -281,15 +338,15 @@ void nf_ct_l4proto_unregister_sysctl(struct net *net,
/* FIXME: Allow NULL functions and sub in pointers to generic for
them. --RR */
-int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto)
+int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *l4proto)
{
int ret = 0;
if (l4proto->l3proto >= PF_MAX)
return -EBUSY;
- if ((l4proto->to_nlattr && !l4proto->nlattr_size)
- || (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size))
+ if ((l4proto->to_nlattr && !l4proto->nlattr_size) ||
+ (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size))
return -EINVAL;
mutex_lock(&nf_ct_proto_mutex);
@@ -307,7 +364,8 @@ int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto)
}
for (i = 0; i < MAX_NF_CT_PROTO; i++)
- RCU_INIT_POINTER(proto_array[i], &nf_conntrack_l4proto_generic);
+ RCU_INIT_POINTER(proto_array[i],
+ &nf_conntrack_l4proto_generic);
/* Before making proto_array visible to lockless readers,
* we must make sure its content is committed to memory.
@@ -335,10 +393,10 @@ out_unlock:
mutex_unlock(&nf_ct_proto_mutex);
return ret;
}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_register);
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_register_one);
-int nf_ct_l4proto_pernet_register(struct net *net,
- struct nf_conntrack_l4proto *l4proto)
+int nf_ct_l4proto_pernet_register_one(struct net *net,
+ struct nf_conntrack_l4proto *l4proto)
{
int ret = 0;
struct nf_proto_net *pn = NULL;
@@ -361,9 +419,9 @@ int nf_ct_l4proto_pernet_register(struct net *net,
out:
return ret;
}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register);
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register_one);
-void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
+void nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *l4proto)
{
BUG_ON(l4proto->l3proto >= PF_MAX);
@@ -378,10 +436,10 @@ void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
synchronize_rcu();
}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister);
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister_one);
-void nf_ct_l4proto_pernet_unregister(struct net *net,
- struct nf_conntrack_l4proto *l4proto)
+void nf_ct_l4proto_pernet_unregister_one(struct net *net,
+ struct nf_conntrack_l4proto *l4proto)
{
struct nf_proto_net *pn = NULL;
@@ -395,6 +453,66 @@ void nf_ct_l4proto_pernet_unregister(struct net *net,
/* Remove all contrack entries for this protocol */
nf_ct_iterate_cleanup(net, kill_l4proto, l4proto, 0, 0);
}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one);
+
+int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto[],
+ unsigned int num_proto)
+{
+ int ret = -EINVAL, ver;
+ unsigned int i;
+
+ for (i = 0; i < num_proto; i++) {
+ ret = nf_ct_l4proto_register_one(l4proto[i]);
+ if (ret < 0)
+ break;
+ }
+ if (i != num_proto) {
+ ver = l4proto[i]->l3proto == PF_INET6 ? 6 : 4;
+ pr_err("nf_conntrack_ipv%d: can't register %s%d proto.\n",
+ ver, l4proto[i]->name, ver);
+ nf_ct_l4proto_unregister(l4proto, i);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_register);
+
+int nf_ct_l4proto_pernet_register(struct net *net,
+ struct nf_conntrack_l4proto *l4proto[],
+ unsigned int num_proto)
+{
+ int ret = -EINVAL;
+ unsigned int i;
+
+ for (i = 0; i < num_proto; i++) {
+ ret = nf_ct_l4proto_pernet_register_one(net, l4proto[i]);
+ if (ret < 0)
+ break;
+ }
+ if (i != num_proto) {
+ pr_err("nf_conntrack_%s%d: pernet registration failed\n",
+ l4proto[i]->name,
+ l4proto[i]->l3proto == PF_INET6 ? 6 : 4);
+ nf_ct_l4proto_pernet_unregister(net, l4proto, i);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register);
+
+void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto[],
+ unsigned int num_proto)
+{
+ while (num_proto-- != 0)
+ nf_ct_l4proto_unregister_one(l4proto[num_proto]);
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister);
+
+void nf_ct_l4proto_pernet_unregister(struct net *net,
+ struct nf_conntrack_l4proto *l4proto[],
+ unsigned int num_proto)
+{
+ while (num_proto-- != 0)
+ nf_ct_l4proto_pernet_unregister_one(net, l4proto[num_proto]);
+}
EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister);
int nf_conntrack_proto_pernet_init(struct net *net)
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index a45bee52dccc..93dd1c5b7bff 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -9,7 +9,6 @@
*
*/
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/spinlock.h>
@@ -384,17 +383,9 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] =
},
};
-/* this module per-net specifics */
-static int dccp_net_id __read_mostly;
-struct dccp_net {
- struct nf_proto_net pn;
- int dccp_loose;
- unsigned int dccp_timeout[CT_DCCP_MAX + 1];
-};
-
-static inline struct dccp_net *dccp_pernet(struct net *net)
+static inline struct nf_dccp_net *dccp_pernet(struct net *net)
{
- return net_generic(net, dccp_net_id);
+ return &net->ct.nf_ct_proto.dccp;
}
static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
@@ -424,7 +415,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
unsigned int dataoff, unsigned int *timeouts)
{
struct net *net = nf_ct_net(ct);
- struct dccp_net *dn;
+ struct nf_dccp_net *dn;
struct dccp_hdr _dh, *dh;
const char *msg;
u_int8_t state;
@@ -570,7 +561,6 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
static int dccp_error(struct net *net, struct nf_conn *tmpl,
struct sk_buff *skb, unsigned int dataoff,
- enum ip_conntrack_info *ctinfo,
u_int8_t pf, unsigned int hooknum)
{
struct dccp_hdr _dh, *dh;
@@ -719,7 +709,7 @@ static int dccp_nlattr_size(void)
static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[],
struct net *net, void *data)
{
- struct dccp_net *dn = dccp_pernet(net);
+ struct nf_dccp_net *dn = dccp_pernet(net);
unsigned int *timeouts = data;
int i;
@@ -820,7 +810,7 @@ static struct ctl_table dccp_sysctl_table[] = {
#endif /* CONFIG_SYSCTL */
static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
- struct dccp_net *dn)
+ struct nf_dccp_net *dn)
{
#ifdef CONFIG_SYSCTL
if (pn->ctl_table)
@@ -850,7 +840,7 @@ static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
static int dccp_init_net(struct net *net, u_int16_t proto)
{
- struct dccp_net *dn = dccp_pernet(net);
+ struct nf_dccp_net *dn = dccp_pernet(net);
struct nf_proto_net *pn = &dn->pn;
if (!pn->users) {
@@ -868,7 +858,7 @@ static int dccp_init_net(struct net *net, u_int16_t proto)
return dccp_kmemdup_sysctl_table(net, pn, dn);
}
-static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {
+struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = {
.l3proto = AF_INET,
.l4proto = IPPROTO_DCCP,
.name = "dccp",
@@ -898,11 +888,11 @@ static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {
.nla_policy = dccp_timeout_nla_policy,
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .net_id = &dccp_net_id,
.init_net = dccp_init_net,
};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4);
-static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = {
+struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 __read_mostly = {
.l3proto = AF_INET6,
.l4proto = IPPROTO_DCCP,
.name = "dccp",
@@ -932,78 +922,6 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = {
.nla_policy = dccp_timeout_nla_policy,
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .net_id = &dccp_net_id,
.init_net = dccp_init_net,
};
-
-static __net_init int dccp_net_init(struct net *net)
-{
- int ret = 0;
- ret = nf_ct_l4proto_pernet_register(net, &dccp_proto4);
- if (ret < 0) {
- pr_err("nf_conntrack_dccp4: pernet registration failed.\n");
- goto out;
- }
- ret = nf_ct_l4proto_pernet_register(net, &dccp_proto6);
- if (ret < 0) {
- pr_err("nf_conntrack_dccp6: pernet registration failed.\n");
- goto cleanup_dccp4;
- }
- return 0;
-cleanup_dccp4:
- nf_ct_l4proto_pernet_unregister(net, &dccp_proto4);
-out:
- return ret;
-}
-
-static __net_exit void dccp_net_exit(struct net *net)
-{
- nf_ct_l4proto_pernet_unregister(net, &dccp_proto6);
- nf_ct_l4proto_pernet_unregister(net, &dccp_proto4);
-}
-
-static struct pernet_operations dccp_net_ops = {
- .init = dccp_net_init,
- .exit = dccp_net_exit,
- .id = &dccp_net_id,
- .size = sizeof(struct dccp_net),
-};
-
-static int __init nf_conntrack_proto_dccp_init(void)
-{
- int ret;
-
- ret = register_pernet_subsys(&dccp_net_ops);
- if (ret < 0)
- goto out_pernet;
-
- ret = nf_ct_l4proto_register(&dccp_proto4);
- if (ret < 0)
- goto out_dccp4;
-
- ret = nf_ct_l4proto_register(&dccp_proto6);
- if (ret < 0)
- goto out_dccp6;
-
- return 0;
-out_dccp6:
- nf_ct_l4proto_unregister(&dccp_proto4);
-out_dccp4:
- unregister_pernet_subsys(&dccp_net_ops);
-out_pernet:
- return ret;
-}
-
-static void __exit nf_conntrack_proto_dccp_fini(void)
-{
- nf_ct_l4proto_unregister(&dccp_proto6);
- nf_ct_l4proto_unregister(&dccp_proto4);
- unregister_pernet_subsys(&dccp_net_ops);
-}
-
-module_init(nf_conntrack_proto_dccp_init);
-module_exit(nf_conntrack_proto_dccp_fini);
-
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("DCCP connection tracking protocol helper");
-MODULE_LICENSE("GPL");
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp6);
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 9a715f88b2f1..87bb40a3feb5 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -53,7 +53,7 @@ static unsigned int gre_timeouts[GRE_CT_MAX] = {
[GRE_CT_REPLIED] = 180*HZ,
};
-static int proto_gre_net_id __read_mostly;
+static unsigned int proto_gre_net_id __read_mostly;
struct netns_proto_gre {
struct nf_proto_net nf;
rwlock_t keymap_lock;
@@ -396,7 +396,9 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = {
static int proto_gre_net_init(struct net *net)
{
int ret = 0;
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_gre4);
+
+ ret = nf_ct_l4proto_pernet_register_one(net,
+ &nf_conntrack_l4proto_gre4);
if (ret < 0)
pr_err("nf_conntrack_gre4: pernet registration failed.\n");
return ret;
@@ -404,7 +406,7 @@ static int proto_gre_net_init(struct net *net)
static void proto_gre_net_exit(struct net *net)
{
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_gre4);
+ nf_ct_l4proto_pernet_unregister_one(net, &nf_conntrack_l4proto_gre4);
nf_ct_gre_keymap_flush(net);
}
@@ -422,8 +424,7 @@ static int __init nf_ct_proto_gre_init(void)
ret = register_pernet_subsys(&proto_gre_net_ops);
if (ret < 0)
goto out_pernet;
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_gre4);
+ ret = nf_ct_l4proto_register_one(&nf_conntrack_l4proto_gre4);
if (ret < 0)
goto out_gre4;
@@ -436,7 +437,7 @@ out_pernet:
static void __exit nf_ct_proto_gre_fini(void)
{
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_gre4);
+ nf_ct_l4proto_unregister_one(&nf_conntrack_l4proto_gre4);
unregister_pernet_subsys(&proto_gre_net_ops);
}
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 982ea62606c7..33279aab583d 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -15,7 +15,6 @@
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/netfilter.h>
-#include <linux/module.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/sctp.h>
@@ -23,7 +22,9 @@
#include <linux/seq_file.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
+#include <net/sctp/checksum.h>
+#include <net/netfilter/nf_log.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_ecache.h>
@@ -144,15 +145,9 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
}
};
-static int sctp_net_id __read_mostly;
-struct sctp_net {
- struct nf_proto_net pn;
- unsigned int timeouts[SCTP_CONNTRACK_MAX];
-};
-
-static inline struct sctp_net *sctp_pernet(struct net *net)
+static inline struct nf_sctp_net *sctp_pernet(struct net *net)
{
- return net_generic(net, sctp_net_id);
+ return &net->ct.nf_ct_proto.sctp;
}
static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
@@ -512,6 +507,34 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
return true;
}
+static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb,
+ unsigned int dataoff,
+ u8 pf, unsigned int hooknum)
+{
+ const struct sctphdr *sh;
+ struct sctphdr _sctph;
+ const char *logmsg;
+
+ sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
+ if (!sh) {
+ logmsg = "nf_ct_sctp: short packet ";
+ goto out_invalid;
+ }
+ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+ skb->ip_summed == CHECKSUM_NONE) {
+ if (sh->checksum != sctp_compute_cksum(skb, dataoff)) {
+ logmsg = "nf_ct_sctp: bad CRC ";
+ goto out_invalid;
+ }
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ return NF_ACCEPT;
+out_invalid:
+ if (LOG_INVALID(net, IPPROTO_SCTP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", logmsg);
+ return -NF_ACCEPT;
+}
+
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
@@ -600,7 +623,7 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[],
struct net *net, void *data)
{
unsigned int *timeouts = data;
- struct sctp_net *sn = sctp_pernet(net);
+ struct nf_sctp_net *sn = sctp_pernet(net);
int i;
/* set default SCTP timeouts. */
@@ -708,7 +731,7 @@ static struct ctl_table sctp_sysctl_table[] = {
#endif
static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
- struct sctp_net *sn)
+ struct nf_sctp_net *sn)
{
#ifdef CONFIG_SYSCTL
if (pn->ctl_table)
@@ -735,7 +758,7 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
static int sctp_init_net(struct net *net, u_int16_t proto)
{
- struct sctp_net *sn = sctp_pernet(net);
+ struct nf_sctp_net *sn = sctp_pernet(net);
struct nf_proto_net *pn = &sn->pn;
if (!pn->users) {
@@ -748,7 +771,7 @@ static int sctp_init_net(struct net *net, u_int16_t proto)
return sctp_kmemdup_sysctl_table(pn, sn);
}
-static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
+struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
.l3proto = PF_INET,
.l4proto = IPPROTO_SCTP,
.name = "sctp",
@@ -759,6 +782,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
.packet = sctp_packet,
.get_timeouts = sctp_get_timeouts,
.new = sctp_new,
+ .error = sctp_error,
.me = THIS_MODULE,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.to_nlattr = sctp_to_nlattr,
@@ -778,11 +802,11 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
.nla_policy = sctp_timeout_nla_policy,
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .net_id = &sctp_net_id,
.init_net = sctp_init_net,
};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4);
-static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
+struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
.l3proto = PF_INET6,
.l4proto = IPPROTO_SCTP,
.name = "sctp",
@@ -793,6 +817,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
.packet = sctp_packet,
.get_timeouts = sctp_get_timeouts,
.new = sctp_new,
+ .error = sctp_error,
.me = THIS_MODULE,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.to_nlattr = sctp_to_nlattr,
@@ -812,81 +837,6 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
#endif
- .net_id = &sctp_net_id,
.init_net = sctp_init_net,
};
-
-static int sctp_net_init(struct net *net)
-{
- int ret = 0;
-
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp4);
- if (ret < 0) {
- pr_err("nf_conntrack_sctp4: pernet registration failed.\n");
- goto out;
- }
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp6);
- if (ret < 0) {
- pr_err("nf_conntrack_sctp6: pernet registration failed.\n");
- goto cleanup_sctp4;
- }
- return 0;
-
-cleanup_sctp4:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4);
-out:
- return ret;
-}
-
-static void sctp_net_exit(struct net *net)
-{
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp6);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4);
-}
-
-static struct pernet_operations sctp_net_ops = {
- .init = sctp_net_init,
- .exit = sctp_net_exit,
- .id = &sctp_net_id,
- .size = sizeof(struct sctp_net),
-};
-
-static int __init nf_conntrack_proto_sctp_init(void)
-{
- int ret;
-
- ret = register_pernet_subsys(&sctp_net_ops);
- if (ret < 0)
- goto out_pernet;
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp4);
- if (ret < 0)
- goto out_sctp4;
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp6);
- if (ret < 0)
- goto out_sctp6;
-
- return 0;
-out_sctp6:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
-out_sctp4:
- unregister_pernet_subsys(&sctp_net_ops);
-out_pernet:
- return ret;
-}
-
-static void __exit nf_conntrack_proto_sctp_fini(void)
-{
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp6);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
- unregister_pernet_subsys(&sctp_net_ops);
-}
-
-module_init(nf_conntrack_proto_sctp_init);
-module_exit(nf_conntrack_proto_sctp_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kiran Kumar Immidi");
-MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP");
-MODULE_ALIAS("ip_conntrack_proto_sctp");
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp6);
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 69f687740c76..b122e9dacfed 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -750,7 +750,6 @@ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
static int tcp_error(struct net *net, struct nf_conn *tmpl,
struct sk_buff *skb,
unsigned int dataoff,
- enum ip_conntrack_info *ctinfo,
u_int8_t pf,
unsigned int hooknum)
{
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 20f35ed68030..f6ebce6178ca 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -108,8 +108,60 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
return true;
}
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+static int udplite_error(struct net *net, struct nf_conn *tmpl,
+ struct sk_buff *skb,
+ unsigned int dataoff,
+ u8 pf, unsigned int hooknum)
+{
+ unsigned int udplen = skb->len - dataoff;
+ const struct udphdr *hdr;
+ struct udphdr _hdr;
+ unsigned int cscov;
+
+ /* Header is too small? */
+ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ if (!hdr) {
+ if (LOG_INVALID(net, IPPROTO_UDPLITE))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udplite: short packet ");
+ return -NF_ACCEPT;
+ }
+
+ cscov = ntohs(hdr->len);
+ if (cscov == 0) {
+ cscov = udplen;
+ } else if (cscov < sizeof(*hdr) || cscov > udplen) {
+ if (LOG_INVALID(net, IPPROTO_UDPLITE))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udplite: invalid checksum coverage ");
+ return -NF_ACCEPT;
+ }
+
+ /* UDPLITE mandates checksums */
+ if (!hdr->check) {
+ if (LOG_INVALID(net, IPPROTO_UDPLITE))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udplite: checksum missing ");
+ return -NF_ACCEPT;
+ }
+
+ /* Checksum invalid? Ignore. */
+ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+ nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
+ pf)) {
+ if (LOG_INVALID(net, IPPROTO_UDPLITE))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udplite: bad UDPLite checksum ");
+ return -NF_ACCEPT;
+ }
+
+ return NF_ACCEPT;
+}
+#endif
+
static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
- unsigned int dataoff, enum ip_conntrack_info *ctinfo,
+ unsigned int dataoff,
u_int8_t pf,
unsigned int hooknum)
{
@@ -290,6 +342,41 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
};
EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4);
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
+{
+ .l3proto = PF_INET,
+ .l4proto = IPPROTO_UDPLITE,
+ .name = "udplite",
+ .allow_clash = true,
+ .pkt_to_tuple = udp_pkt_to_tuple,
+ .invert_tuple = udp_invert_tuple,
+ .print_tuple = udp_print_tuple,
+ .packet = udp_packet,
+ .get_timeouts = udp_get_timeouts,
+ .new = udp_new,
+ .error = udplite_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = udp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = udp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_UDP_MAX,
+ .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
+ .nla_policy = udp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = udp_init_net,
+ .get_net_proto = udp_get_net_proto,
+};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4);
+#endif
+
struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
{
.l3proto = PF_INET6,
@@ -322,3 +409,38 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
.get_net_proto = udp_get_net_proto,
};
EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6);
+
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
+{
+ .l3proto = PF_INET6,
+ .l4proto = IPPROTO_UDPLITE,
+ .name = "udplite",
+ .allow_clash = true,
+ .pkt_to_tuple = udp_pkt_to_tuple,
+ .invert_tuple = udp_invert_tuple,
+ .print_tuple = udp_print_tuple,
+ .packet = udp_packet,
+ .get_timeouts = udp_get_timeouts,
+ .new = udp_new,
+ .error = udplite_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = udp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = udp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_UDP_MAX,
+ .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
+ .nla_policy = udp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = udp_init_net,
+ .get_net_proto = udp_get_net_proto,
+};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6);
+#endif
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
deleted file mode 100644
index 029206e8dec4..000000000000
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ /dev/null
@@ -1,409 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2007 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/timer.h>
-#include <linux/module.h>
-#include <linux/udp.h>
-#include <linux/seq_file.h>
-#include <linux/skbuff.h>
-#include <linux/ipv6.h>
-#include <net/ip6_checksum.h>
-#include <net/checksum.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv6.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-#include <net/netfilter/nf_log.h>
-
-enum udplite_conntrack {
- UDPLITE_CT_UNREPLIED,
- UDPLITE_CT_REPLIED,
- UDPLITE_CT_MAX
-};
-
-static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = {
- [UDPLITE_CT_UNREPLIED] = 30*HZ,
- [UDPLITE_CT_REPLIED] = 180*HZ,
-};
-
-static int udplite_net_id __read_mostly;
-struct udplite_net {
- struct nf_proto_net pn;
- unsigned int timeouts[UDPLITE_CT_MAX];
-};
-
-static inline struct udplite_net *udplite_pernet(struct net *net)
-{
- return net_generic(net, udplite_net_id);
-}
-
-static bool udplite_pkt_to_tuple(const struct sk_buff *skb,
- unsigned int dataoff,
- struct net *net,
- struct nf_conntrack_tuple *tuple)
-{
- const struct udphdr *hp;
- struct udphdr _hdr;
-
- /* Actually only need first 4 bytes to get ports. */
- hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
- if (hp == NULL)
- return false;
-
- tuple->src.u.udp.port = hp->source;
- tuple->dst.u.udp.port = hp->dest;
- return true;
-}
-
-static bool udplite_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- tuple->src.u.udp.port = orig->dst.u.udp.port;
- tuple->dst.u.udp.port = orig->src.u.udp.port;
- return true;
-}
-
-/* Print out the per-protocol part of the tuple. */
-static void udplite_print_tuple(struct seq_file *s,
- const struct nf_conntrack_tuple *tuple)
-{
- seq_printf(s, "sport=%hu dport=%hu ",
- ntohs(tuple->src.u.udp.port),
- ntohs(tuple->dst.u.udp.port));
-}
-
-static unsigned int *udplite_get_timeouts(struct net *net)
-{
- return udplite_pernet(net)->timeouts;
-}
-
-/* Returns verdict for packet, and may modify conntracktype */
-static int udplite_packet(struct nf_conn *ct,
- const struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- u_int8_t pf,
- unsigned int hooknum,
- unsigned int *timeouts)
-{
- /* If we've seen traffic both ways, this is some kind of UDP
- stream. Extend timeout. */
- if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- nf_ct_refresh_acct(ct, ctinfo, skb,
- timeouts[UDPLITE_CT_REPLIED]);
- /* Also, more likely to be important, and not a probe */
- if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
- nf_conntrack_event_cache(IPCT_ASSURED, ct);
- } else {
- nf_ct_refresh_acct(ct, ctinfo, skb,
- timeouts[UDPLITE_CT_UNREPLIED]);
- }
- return NF_ACCEPT;
-}
-
-/* Called when a new connection for this protocol found. */
-static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
-{
- return true;
-}
-
-static int udplite_error(struct net *net, struct nf_conn *tmpl,
- struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info *ctinfo,
- u_int8_t pf,
- unsigned int hooknum)
-{
- unsigned int udplen = skb->len - dataoff;
- const struct udphdr *hdr;
- struct udphdr _hdr;
- unsigned int cscov;
-
- /* Header is too small? */
- hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
- if (hdr == NULL) {
- if (LOG_INVALID(net, IPPROTO_UDPLITE))
- nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
- "nf_ct_udplite: short packet ");
- return -NF_ACCEPT;
- }
-
- cscov = ntohs(hdr->len);
- if (cscov == 0)
- cscov = udplen;
- else if (cscov < sizeof(*hdr) || cscov > udplen) {
- if (LOG_INVALID(net, IPPROTO_UDPLITE))
- nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
- "nf_ct_udplite: invalid checksum coverage ");
- return -NF_ACCEPT;
- }
-
- /* UDPLITE mandates checksums */
- if (!hdr->check) {
- if (LOG_INVALID(net, IPPROTO_UDPLITE))
- nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
- "nf_ct_udplite: checksum missing ");
- return -NF_ACCEPT;
- }
-
- /* Checksum invalid? Ignore. */
- if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
- nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
- pf)) {
- if (LOG_INVALID(net, IPPROTO_UDPLITE))
- nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
- "nf_ct_udplite: bad UDPLite checksum ");
- return -NF_ACCEPT;
- }
-
- return NF_ACCEPT;
-}
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_cttimeout.h>
-
-static int udplite_timeout_nlattr_to_obj(struct nlattr *tb[],
- struct net *net, void *data)
-{
- unsigned int *timeouts = data;
- struct udplite_net *un = udplite_pernet(net);
-
- /* set default timeouts for UDPlite. */
- timeouts[UDPLITE_CT_UNREPLIED] = un->timeouts[UDPLITE_CT_UNREPLIED];
- timeouts[UDPLITE_CT_REPLIED] = un->timeouts[UDPLITE_CT_REPLIED];
-
- if (tb[CTA_TIMEOUT_UDPLITE_UNREPLIED]) {
- timeouts[UDPLITE_CT_UNREPLIED] =
- ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_UNREPLIED])) * HZ;
- }
- if (tb[CTA_TIMEOUT_UDPLITE_REPLIED]) {
- timeouts[UDPLITE_CT_REPLIED] =
- ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_REPLIED])) * HZ;
- }
- return 0;
-}
-
-static int
-udplite_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
-{
- const unsigned int *timeouts = data;
-
- if (nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_UNREPLIED,
- htonl(timeouts[UDPLITE_CT_UNREPLIED] / HZ)) ||
- nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_REPLIED,
- htonl(timeouts[UDPLITE_CT_REPLIED] / HZ)))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return -ENOSPC;
-}
-
-static const struct nla_policy
-udplite_timeout_nla_policy[CTA_TIMEOUT_UDPLITE_MAX+1] = {
- [CTA_TIMEOUT_UDPLITE_UNREPLIED] = { .type = NLA_U32 },
- [CTA_TIMEOUT_UDPLITE_REPLIED] = { .type = NLA_U32 },
-};
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-
-#ifdef CONFIG_SYSCTL
-static struct ctl_table udplite_sysctl_table[] = {
- {
- .procname = "nf_conntrack_udplite_timeout",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_udplite_timeout_stream",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- { }
-};
-#endif /* CONFIG_SYSCTL */
-
-static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn,
- struct udplite_net *un)
-{
-#ifdef CONFIG_SYSCTL
- if (pn->ctl_table)
- return 0;
-
- pn->ctl_table = kmemdup(udplite_sysctl_table,
- sizeof(udplite_sysctl_table),
- GFP_KERNEL);
- if (!pn->ctl_table)
- return -ENOMEM;
-
- pn->ctl_table[0].data = &un->timeouts[UDPLITE_CT_UNREPLIED];
- pn->ctl_table[1].data = &un->timeouts[UDPLITE_CT_REPLIED];
-#endif
- return 0;
-}
-
-static int udplite_init_net(struct net *net, u_int16_t proto)
-{
- struct udplite_net *un = udplite_pernet(net);
- struct nf_proto_net *pn = &un->pn;
-
- if (!pn->users) {
- int i;
-
- for (i = 0 ; i < UDPLITE_CT_MAX; i++)
- un->timeouts[i] = udplite_timeouts[i];
- }
-
- return udplite_kmemdup_sysctl_table(pn, un);
-}
-
-static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
-{
- .l3proto = PF_INET,
- .l4proto = IPPROTO_UDPLITE,
- .name = "udplite",
- .allow_clash = true,
- .pkt_to_tuple = udplite_pkt_to_tuple,
- .invert_tuple = udplite_invert_tuple,
- .print_tuple = udplite_print_tuple,
- .packet = udplite_packet,
- .get_timeouts = udplite_get_timeouts,
- .new = udplite_new,
- .error = udplite_error,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
- .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
- .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
- .nla_policy = nf_ct_port_nla_policy,
-#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
- .ctnl_timeout = {
- .nlattr_to_obj = udplite_timeout_nlattr_to_obj,
- .obj_to_nlattr = udplite_timeout_obj_to_nlattr,
- .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX,
- .obj_size = sizeof(unsigned int) *
- CTA_TIMEOUT_UDPLITE_MAX,
- .nla_policy = udplite_timeout_nla_policy,
- },
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .net_id = &udplite_net_id,
- .init_net = udplite_init_net,
-};
-
-static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
-{
- .l3proto = PF_INET6,
- .l4proto = IPPROTO_UDPLITE,
- .name = "udplite",
- .allow_clash = true,
- .pkt_to_tuple = udplite_pkt_to_tuple,
- .invert_tuple = udplite_invert_tuple,
- .print_tuple = udplite_print_tuple,
- .packet = udplite_packet,
- .get_timeouts = udplite_get_timeouts,
- .new = udplite_new,
- .error = udplite_error,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
- .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
- .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
- .nla_policy = nf_ct_port_nla_policy,
-#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
- .ctnl_timeout = {
- .nlattr_to_obj = udplite_timeout_nlattr_to_obj,
- .obj_to_nlattr = udplite_timeout_obj_to_nlattr,
- .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX,
- .obj_size = sizeof(unsigned int) *
- CTA_TIMEOUT_UDPLITE_MAX,
- .nla_policy = udplite_timeout_nla_policy,
- },
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .net_id = &udplite_net_id,
- .init_net = udplite_init_net,
-};
-
-static int udplite_net_init(struct net *net)
-{
- int ret = 0;
-
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite4);
- if (ret < 0) {
- pr_err("nf_conntrack_udplite4: pernet registration failed.\n");
- goto out;
- }
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite6);
- if (ret < 0) {
- pr_err("nf_conntrack_udplite6: pernet registration failed.\n");
- goto cleanup_udplite4;
- }
- return 0;
-
-cleanup_udplite4:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4);
-out:
- return ret;
-}
-
-static void udplite_net_exit(struct net *net)
-{
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite6);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4);
-}
-
-static struct pernet_operations udplite_net_ops = {
- .init = udplite_net_init,
- .exit = udplite_net_exit,
- .id = &udplite_net_id,
- .size = sizeof(struct udplite_net),
-};
-
-static int __init nf_conntrack_proto_udplite_init(void)
-{
- int ret;
-
- ret = register_pernet_subsys(&udplite_net_ops);
- if (ret < 0)
- goto out_pernet;
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite4);
- if (ret < 0)
- goto out_udplite4;
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite6);
- if (ret < 0)
- goto out_udplite6;
-
- return 0;
-out_udplite6:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
-out_udplite4:
- unregister_pernet_subsys(&udplite_net_ops);
-out_pernet:
- return ret;
-}
-
-static void __exit nf_conntrack_proto_udplite_exit(void)
-{
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite6);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
- unregister_pernet_subsys(&udplite_net_ops);
-}
-
-module_init(nf_conntrack_proto_udplite_init);
-module_exit(nf_conntrack_proto_udplite_exit);
-
-MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index c3fc14e021ec..0d17894798b5 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -809,13 +809,11 @@ static int refresh_signalling_expectation(struct nf_conn *ct,
exp->tuple.dst.protonum != proto ||
exp->tuple.dst.u.udp.port != port)
continue;
- if (!del_timer(&exp->timeout))
- continue;
- exp->flags &= ~NF_CT_EXPECT_INACTIVE;
- exp->timeout.expires = jiffies + expires * HZ;
- add_timer(&exp->timeout);
- found = 1;
- break;
+ if (mod_timer_pending(&exp->timeout, jiffies + expires * HZ)) {
+ exp->flags &= ~NF_CT_EXPECT_INACTIVE;
+ found = 1;
+ break;
+ }
}
spin_unlock_bh(&nf_conntrack_expect_lock);
return found;
@@ -1630,8 +1628,6 @@ static int __init nf_conntrack_sip_init(void)
ports[ports_c++] = SIP_PORT;
for (i = 0; i < ports_c; i++) {
- memset(&sip[i], 0, sizeof(sip[i]));
-
nf_ct_helper_init(&sip[4 * i], AF_INET, IPPROTO_UDP, "sip",
SIP_PORT, ports[i], i, sip_exp_policy,
SIP_EXPECT_MAX,
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 5f446cd9f3fd..2256147dcaad 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -452,6 +452,9 @@ static int log_invalid_proto_max __read_mostly = 255;
/* size the user *wants to set */
static unsigned int nf_conntrack_htable_size_user __read_mostly;
+extern unsigned int nf_conntrack_default_on;
+unsigned int nf_conntrack_default_on __read_mostly = 1;
+
static int
nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -517,6 +520,13 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "nf_conntrack_default_on",
+ .data = &nf_conntrack_default_on,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};
@@ -632,6 +642,9 @@ static int __init nf_conntrack_standalone_init(void)
if (ret < 0)
goto out_start;
+ BUILD_BUG_ON(SKB_NFCT_PTRMASK != NFCT_PTRMASK);
+ BUILD_BUG_ON(NFCT_INFOMASK <= IP_CT_NUMBER);
+
#ifdef CONFIG_SYSCTL
nf_ct_netfilter_header =
register_net_sysctl(&init_net, "net", nf_ct_netfilter_table);
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index 7ec69723940f..c9d7f95768ab 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -14,24 +14,41 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
+static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev)
+{
+ if (skb_mac_header_was_set(skb))
+ skb_push(skb, skb->mac_len);
+
+ skb->dev = dev;
+ dev_queue_xmit(skb);
+}
+
+void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index_rcu(nft_net(pkt), oif);
+ if (!dev) {
+ kfree_skb(pkt->skb);
+ return;
+ }
+
+ nf_do_netdev_egress(pkt->skb, dev);
+}
+EXPORT_SYMBOL_GPL(nf_fwd_netdev_egress);
+
void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif)
{
struct net_device *dev;
struct sk_buff *skb;
- dev = dev_get_by_index_rcu(pkt->net, oif);
+ dev = dev_get_by_index_rcu(nft_net(pkt), oif);
if (dev == NULL)
return;
skb = skb_clone(pkt->skb, GFP_ATOMIC);
- if (skb == NULL)
- return;
-
- if (skb_mac_header_was_set(skb))
- skb_push(skb, skb->mac_len);
-
- skb->dev = dev;
- dev_queue_xmit(skb);
+ if (skb)
+ nf_do_netdev_egress(skb, dev);
}
EXPORT_SYMBOL_GPL(nf_dup_netdev_egress);
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 9fdb655f85bc..c46d214d5323 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -11,11 +11,6 @@
#define NFDEBUG(format, args...)
#endif
-
-/* core.c */
-unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state,
- struct nf_hook_entry **entryp);
-
/* nf_queue.c */
int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
struct nf_hook_entry **entryp, unsigned int verdict);
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 3dca90dc24ad..8d85a0598b60 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -13,9 +13,11 @@
/* Internal logging interface, which relies on the real
LOG target modules */
-#define NF_LOG_PREFIXLEN 128
#define NFLOGGER_NAME_LEN 64
+int sysctl_nf_log_all_netns __read_mostly;
+EXPORT_SYMBOL(sysctl_nf_log_all_netns);
+
static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly;
static DEFINE_MUTEX(nf_log_mutex);
@@ -414,6 +416,18 @@ static const struct file_operations nflog_file_ops = {
#ifdef CONFIG_SYSCTL
static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
+static struct ctl_table_header *nf_log_sysctl_fhdr;
+
+static struct ctl_table nf_log_sysctl_ftable[] = {
+ {
+ .procname = "nf_log_all_netns",
+ .data = &sysctl_nf_log_all_netns,
+ .maxlen = sizeof(sysctl_nf_log_all_netns),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
static int nf_log_proc_dostring(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -483,6 +497,10 @@ static int netfilter_log_sysctl_init(struct net *net)
nf_log_sysctl_table[i].extra1 =
(void *)(unsigned long) i;
}
+ nf_log_sysctl_fhdr = register_net_sysctl(net, "net/netfilter",
+ nf_log_sysctl_ftable);
+ if (!nf_log_sysctl_fhdr)
+ goto err_freg;
}
for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
@@ -499,6 +517,9 @@ static int netfilter_log_sysctl_init(struct net *net)
err_reg:
if (!net_eq(net, &init_net))
kfree(table);
+ else
+ unregister_net_sysctl_table(nf_log_sysctl_fhdr);
+err_freg:
err_alloc:
return -ENOMEM;
}
@@ -511,6 +532,8 @@ static void netfilter_log_sysctl_exit(struct net *net)
unregister_net_sysctl_table(net->nf.nf_log_dir_header);
if (!net_eq(net, &init_net))
kfree(table);
+ else
+ unregister_net_sysctl_table(nf_log_sysctl_fhdr);
}
#else
static int netfilter_log_sysctl_init(struct net *net)
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
index 119fe1cb1ea9..dc61399e30be 100644
--- a/net/netfilter/nf_log_common.c
+++ b/net/netfilter/nf_log_common.c
@@ -175,6 +175,34 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf,
}
EXPORT_SYMBOL_GPL(nf_log_dump_packet_common);
+/* bridge and netdev logging families share this code. */
+void nf_log_l2packet(struct net *net, u_int8_t pf,
+ __be16 protocol,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ switch (protocol) {
+ case htons(ETH_P_IP):
+ nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out,
+ loginfo, "%s", prefix);
+ break;
+ case htons(ETH_P_IPV6):
+ nf_log_packet(net, NFPROTO_IPV6, hooknum, skb, in, out,
+ loginfo, "%s", prefix);
+ break;
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_RARP):
+ nf_log_packet(net, NFPROTO_ARP, hooknum, skb, in, out,
+ loginfo, "%s", prefix);
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(nf_log_l2packet);
+
static int __init nf_log_common_init(void)
{
return 0;
diff --git a/net/netfilter/nf_log_netdev.c b/net/netfilter/nf_log_netdev.c
new file mode 100644
index 000000000000..350eb147754d
--- /dev/null
+++ b/net/netfilter/nf_log_netdev.c
@@ -0,0 +1,81 @@
+/*
+ * (C) 2016 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_log.h>
+
+static void nf_log_netdev_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ nf_log_l2packet(net, pf, skb->protocol, hooknum, skb, in, out,
+ loginfo, prefix);
+}
+
+static struct nf_logger nf_netdev_logger __read_mostly = {
+ .name = "nf_log_netdev",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_netdev_packet,
+ .me = THIS_MODULE,
+};
+
+static int __net_init nf_log_netdev_net_init(struct net *net)
+{
+ return nf_log_set(net, NFPROTO_NETDEV, &nf_netdev_logger);
+}
+
+static void __net_exit nf_log_netdev_net_exit(struct net *net)
+{
+ nf_log_unset(net, &nf_netdev_logger);
+}
+
+static struct pernet_operations nf_log_netdev_net_ops = {
+ .init = nf_log_netdev_net_init,
+ .exit = nf_log_netdev_net_exit,
+};
+
+static int __init nf_log_netdev_init(void)
+{
+ int ret;
+
+ /* Request to load the real packet loggers. */
+ nf_logger_request_module(NFPROTO_IPV4, NF_LOG_TYPE_LOG);
+ nf_logger_request_module(NFPROTO_IPV6, NF_LOG_TYPE_LOG);
+ nf_logger_request_module(NFPROTO_ARP, NF_LOG_TYPE_LOG);
+
+ ret = register_pernet_subsys(&nf_log_netdev_net_ops);
+ if (ret < 0)
+ return ret;
+
+ nf_log_register(NFPROTO_NETDEV, &nf_netdev_logger);
+ return 0;
+}
+
+static void __exit nf_log_netdev_exit(void)
+{
+ unregister_pernet_subsys(&nf_log_netdev_net_ops);
+ nf_log_unregister(&nf_netdev_logger);
+}
+
+module_init(nf_log_netdev_init);
+module_exit(nf_log_netdev_exit);
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter netdev packet logging");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NF_LOGGER(5, 0); /* NFPROTO_NETDEV */
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 5b9c884a452e..82802e4a6640 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -682,6 +682,18 @@ int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
&nf_nat_l4proto_tcp);
RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP],
&nf_nat_l4proto_udp);
+#ifdef CONFIG_NF_NAT_PROTO_DCCP
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_DCCP],
+ &nf_nat_l4proto_dccp);
+#endif
+#ifdef CONFIG_NF_NAT_PROTO_SCTP
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_SCTP],
+ &nf_nat_l4proto_sctp);
+#endif
+#ifdef CONFIG_NF_NAT_PROTO_UDPLITE
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDPLITE],
+ &nf_nat_l4proto_udplite);
+#endif
mutex_unlock(&nf_nat_proto_mutex);
RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
@@ -891,6 +903,8 @@ static void __exit nf_nat_cleanup(void)
#ifdef CONFIG_XFRM
RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL);
#endif
+ synchronize_rcu();
+
for (i = 0; i < NFPROTO_NUMPROTO; i++)
kfree(nf_nat_l4protos[i]);
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 2840abb5bb99..211661cb2c90 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -60,7 +60,7 @@ static void mangle_contents(struct sk_buff *skb,
__skb_trim(skb, skb->len + rep_len - match_len);
}
- if (nf_ct_l3num((struct nf_conn *)skb->nfct) == NFPROTO_IPV4) {
+ if (nf_ct_l3num((struct nf_conn *)skb_nfct(skb)) == NFPROTO_IPV4) {
/* fix IP hdr checksum information */
ip_hdr(skb)->tot_len = htons(skb->len);
ip_send_check(ip_hdr(skb));
diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c
index 15c47b246d0d..269fcd5dc34c 100644
--- a/net/netfilter/nf_nat_proto_dccp.c
+++ b/net/netfilter/nf_nat_proto_dccp.c
@@ -10,8 +10,6 @@
*/
#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/dccp.h>
@@ -73,7 +71,7 @@ dccp_manip_pkt(struct sk_buff *skb,
return true;
}
-static const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
+const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
.l4proto = IPPROTO_DCCP,
.manip_pkt = dccp_manip_pkt,
.in_range = nf_nat_l4proto_in_range,
@@ -82,35 +80,3 @@ static const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
.nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
#endif
};
-
-static int __init nf_nat_proto_dccp_init(void)
-{
- int err;
-
- err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
- if (err < 0)
- goto err1;
- err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_dccp);
- if (err < 0)
- goto err2;
- return 0;
-
-err2:
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
-err1:
- return err;
-}
-
-static void __exit nf_nat_proto_dccp_fini(void)
-{
- nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_dccp);
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
-
-}
-
-module_init(nf_nat_proto_dccp_init);
-module_exit(nf_nat_proto_dccp_fini);
-
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("DCCP NAT protocol helper");
-MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
index cbc7ade1487b..804e8a0ab36e 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -7,9 +7,7 @@
*/
#include <linux/types.h>
-#include <linux/init.h>
#include <linux/sctp.h>
-#include <linux/module.h>
#include <net/sctp/checksum.h>
#include <net/netfilter/nf_nat_l4proto.h>
@@ -35,8 +33,16 @@ sctp_manip_pkt(struct sk_buff *skb,
enum nf_nat_manip_type maniptype)
{
sctp_sctphdr_t *hdr;
+ int hdrsize = 8;
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ /* This could be an inner header returned in imcp packet; in such
+ * cases we cannot update the checksum field since it is outside
+ * of the 8 bytes of transport layer headers we are guaranteed.
+ */
+ if (skb->len >= hdroff + sizeof(*hdr))
+ hdrsize = sizeof(*hdr);
+
+ if (!skb_make_writable(skb, hdroff + hdrsize))
return false;
hdr = (struct sctphdr *)(skb->data + hdroff);
@@ -49,12 +55,18 @@ sctp_manip_pkt(struct sk_buff *skb,
hdr->dest = tuple->dst.u.sctp.port;
}
- hdr->checksum = sctp_compute_cksum(skb, hdroff);
+ if (hdrsize < sizeof(*hdr))
+ return true;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ hdr->checksum = sctp_compute_cksum(skb, hdroff);
+ skb->ip_summed = CHECKSUM_NONE;
+ }
return true;
}
-static const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
+const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
.l4proto = IPPROTO_SCTP,
.manip_pkt = sctp_manip_pkt,
.in_range = nf_nat_l4proto_in_range,
@@ -63,34 +75,3 @@ static const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
.nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
#endif
};
-
-static int __init nf_nat_proto_sctp_init(void)
-{
- int err;
-
- err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
- if (err < 0)
- goto err1;
- err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_sctp);
- if (err < 0)
- goto err2;
- return 0;
-
-err2:
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
-err1:
- return err;
-}
-
-static void __exit nf_nat_proto_sctp_exit(void)
-{
- nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_sctp);
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
-}
-
-module_init(nf_nat_proto_sctp_init);
-module_exit(nf_nat_proto_sctp_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SCTP NAT protocol helper");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c
index b1e627227b6e..edd4a77dc09a 100644
--- a/net/netfilter/nf_nat_proto_udp.c
+++ b/net/netfilter/nf_nat_proto_udp.c
@@ -30,20 +30,15 @@ udp_unique_tuple(const struct nf_nat_l3proto *l3proto,
&udp_port_rover);
}
-static bool
-udp_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
+static void
+__udp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, struct udphdr *hdr,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype, bool do_csum)
{
- struct udphdr *hdr;
__be16 *portptr, newport;
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
- return false;
- hdr = (struct udphdr *)(skb->data + hdroff);
-
if (maniptype == NF_NAT_MANIP_SRC) {
/* Get rid of src port */
newport = tuple->src.u.udp.port;
@@ -53,7 +48,7 @@ udp_manip_pkt(struct sk_buff *skb,
newport = tuple->dst.u.udp.port;
portptr = &hdr->dest;
}
- if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ if (do_csum) {
l3proto->csum_update(skb, iphdroff, &hdr->check,
tuple, maniptype);
inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
@@ -62,9 +57,68 @@ udp_manip_pkt(struct sk_buff *skb,
hdr->check = CSUM_MANGLED_0;
}
*portptr = newport;
+}
+
+static bool udp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct udphdr *hdr;
+ bool do_csum;
+
+ if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ return false;
+
+ hdr = (struct udphdr *)(skb->data + hdroff);
+ do_csum = hdr->check || skb->ip_summed == CHECKSUM_PARTIAL;
+
+ __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, do_csum);
+ return true;
+}
+
+#ifdef CONFIG_NF_NAT_PROTO_UDPLITE
+static u16 udplite_port_rover;
+
+static bool udplite_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct udphdr *hdr;
+
+ if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ return false;
+
+ hdr = (struct udphdr *)(skb->data + hdroff);
+ __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, true);
return true;
}
+static void
+udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+ &udplite_port_rover);
+}
+
+const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
+ .l4proto = IPPROTO_UDPLITE,
+ .manip_pkt = udplite_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range,
+ .unique_tuple = udplite_unique_tuple,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
+#endif
+};
+#endif /* CONFIG_NF_NAT_PROTO_UDPLITE */
+
const struct nf_nat_l4proto nf_nat_l4proto_udp = {
.l4proto = IPPROTO_UDP,
.manip_pkt = udp_manip_pkt,
diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c
deleted file mode 100644
index 58340c97bd83..000000000000
--- a/net/netfilter/nf_nat_proto_udplite.c
+++ /dev/null
@@ -1,106 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/udp.h>
-
-#include <linux/netfilter.h>
-#include <linux/module.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-
-static u16 udplite_port_rover;
-
-static void
-udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
- struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
- &udplite_port_rover);
-}
-
-static bool
-udplite_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- struct udphdr *hdr;
- __be16 *portptr, newport;
-
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
- return false;
-
- hdr = (struct udphdr *)(skb->data + hdroff);
-
- if (maniptype == NF_NAT_MANIP_SRC) {
- /* Get rid of source port */
- newport = tuple->src.u.udp.port;
- portptr = &hdr->source;
- } else {
- /* Get rid of dst port */
- newport = tuple->dst.u.udp.port;
- portptr = &hdr->dest;
- }
-
- l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
- inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false);
- if (!hdr->check)
- hdr->check = CSUM_MANGLED_0;
-
- *portptr = newport;
- return true;
-}
-
-static const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
- .l4proto = IPPROTO_UDPLITE,
- .manip_pkt = udplite_manip_pkt,
- .in_range = nf_nat_l4proto_in_range,
- .unique_tuple = udplite_unique_tuple,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
-#endif
-};
-
-static int __init nf_nat_proto_udplite_init(void)
-{
- int err;
-
- err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
- if (err < 0)
- goto err1;
- err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_udplite);
- if (err < 0)
- goto err2;
- return 0;
-
-err2:
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
-err1:
- return err;
-}
-
-static void __exit nf_nat_proto_udplite_fini(void)
-{
- nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_udplite);
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
-}
-
-module_init(nf_nat_proto_udplite_init);
-module_exit(nf_nat_proto_udplite_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("UDP-Lite NAT protocol helper");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index d43869879fcf..86067560a318 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -101,11 +101,13 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
rcu_read_lock();
idev = __in6_dev_get(skb->dev);
if (idev != NULL) {
+ read_lock_bh(&idev->lock);
list_for_each_entry(ifa, &idev->addr_list, if_list) {
newdst = ifa->addr;
addr = true;
break;
}
+ read_unlock_bh(&idev->lock);
}
rcu_read_unlock();
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 8f08d759844a..4a7662486f44 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -108,7 +108,7 @@ void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry)
}
static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
- unsigned int queuenum)
+ struct nf_hook_entry *hook_entry, unsigned int queuenum)
{
int status = -ENOENT;
struct nf_queue_entry *entry = NULL;
@@ -136,6 +136,7 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
*entry = (struct nf_queue_entry) {
.skb = skb,
.state = *state,
+ .hook = hook_entry,
.size = sizeof(*entry) + afinfo->route_key_size,
};
@@ -163,8 +164,7 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
struct nf_hook_entry *entry = *entryp;
int ret;
- RCU_INIT_POINTER(state->hook_entries, entry);
- ret = __nf_queue(skb, state, verdict >> NF_VERDICT_QBITS);
+ ret = __nf_queue(skb, state, entry, verdict >> NF_VERDICT_QBITS);
if (ret < 0) {
if (ret == -ESRCH &&
(verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) {
@@ -177,22 +177,38 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
return 0;
}
+static unsigned int nf_iterate(struct sk_buff *skb,
+ struct nf_hook_state *state,
+ struct nf_hook_entry **entryp)
+{
+ unsigned int verdict;
+
+ do {
+repeat:
+ verdict = nf_hook_entry_hookfn((*entryp), skb, state);
+ if (verdict != NF_ACCEPT) {
+ if (verdict != NF_REPEAT)
+ return verdict;
+ goto repeat;
+ }
+ *entryp = rcu_dereference((*entryp)->next);
+ } while (*entryp);
+
+ return NF_ACCEPT;
+}
+
void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
- struct nf_hook_entry *hook_entry;
+ struct nf_hook_entry *hook_entry = entry->hook;
struct sk_buff *skb = entry->skb;
const struct nf_afinfo *afinfo;
- struct nf_hook_ops *elem;
int err;
- hook_entry = rcu_dereference(entry->state.hook_entries);
- elem = &hook_entry->ops;
-
nf_queue_entry_release_refs(entry);
/* Continue traversal iff userspace said ok... */
if (verdict == NF_REPEAT)
- verdict = elem->hook(elem->priv, skb, &entry->state);
+ verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state);
if (verdict == NF_ACCEPT) {
afinfo = nf_get_afinfo(entry->state.pf);
@@ -200,8 +216,6 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
verdict = NF_DROP;
}
- entry->state.thresh = INT_MIN;
-
if (verdict == NF_ACCEPT) {
hook_entry = rcu_dereference(hook_entry->next);
if (hook_entry)
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index c8a4a48bced9..7c6d1fbe38b9 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -24,7 +24,7 @@
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_conntrack_zones.h>
-int synproxy_net_id;
+unsigned int synproxy_net_id;
EXPORT_SYMBOL_GPL(synproxy_net_id);
bool
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index e5194f6f906c..434c739dfeca 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -22,6 +22,7 @@
#include <net/sock.h>
static LIST_HEAD(nf_tables_expressions);
+static LIST_HEAD(nf_tables_objects);
/**
* nft_register_afinfo - register nf_tables address family info
@@ -110,12 +111,12 @@ static void nft_ctx_init(struct nft_ctx *ctx,
ctx->seq = nlh->nlmsg_seq;
}
-static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type,
- u32 size)
+static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx,
+ int msg_type, u32 size, gfp_t gfp)
{
struct nft_trans *trans;
- trans = kzalloc(sizeof(struct nft_trans) + size, GFP_KERNEL);
+ trans = kzalloc(sizeof(struct nft_trans) + size, gfp);
if (trans == NULL)
return NULL;
@@ -125,6 +126,12 @@ static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type,
return trans;
}
+static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
+ int msg_type, u32 size)
+{
+ return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL);
+}
+
static void nft_trans_destroy(struct nft_trans *trans)
{
list_del(&trans->list);
@@ -233,6 +240,10 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type,
if (trans == NULL)
return NULL;
+ if (msg_type == NFT_MSG_NEWRULE && ctx->nla[NFTA_RULE_ID] != NULL) {
+ nft_trans_rule_id(trans) =
+ ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID]));
+ }
nft_trans_rule(trans) = rule;
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
@@ -304,6 +315,38 @@ static int nft_delset(struct nft_ctx *ctx, struct nft_set *set)
return err;
}
+static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type,
+ struct nft_object *obj)
+{
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_obj));
+ if (trans == NULL)
+ return -ENOMEM;
+
+ if (msg_type == NFT_MSG_NEWOBJ)
+ nft_activate_next(ctx->net, obj);
+
+ nft_trans_obj(trans) = obj;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+ return 0;
+}
+
+static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj)
+{
+ int err;
+
+ err = nft_trans_obj_add(ctx, NFT_MSG_DELOBJ, obj);
+ if (err < 0)
+ return err;
+
+ nft_deactivate_next(ctx->net, obj);
+ ctx->table->use--;
+
+ return err;
+}
+
/*
* Tables
*/
@@ -418,16 +461,15 @@ nla_put_failure:
return -1;
}
-static int nf_tables_table_notify(const struct nft_ctx *ctx, int event)
+static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
{
struct sk_buff *skb;
int err;
if (!ctx->report &&
!nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
- return 0;
+ return;
- err = -ENOBUFS;
skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
if (skb == NULL)
goto err;
@@ -439,14 +481,11 @@ static int nf_tables_table_notify(const struct nft_ctx *ctx, int event)
goto err;
}
- err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+ ctx->report, GFP_KERNEL);
+ return;
err:
- if (err < 0) {
- nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- err);
- }
- return err;
+ nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
}
static int nf_tables_dump_tables(struct sk_buff *skb,
@@ -537,6 +576,28 @@ err:
return err;
}
+static void _nf_tables_table_disable(struct net *net,
+ const struct nft_af_info *afi,
+ struct nft_table *table,
+ u32 cnt)
+{
+ struct nft_chain *chain;
+ u32 i = 0;
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!nft_is_active_next(net, chain))
+ continue;
+ if (!(chain->flags & NFT_BASE_CHAIN))
+ continue;
+
+ if (cnt && i++ == cnt)
+ break;
+
+ nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
+ afi->nops);
+ }
+}
+
static int nf_tables_table_enable(struct net *net,
const struct nft_af_info *afi,
struct nft_table *table)
@@ -559,18 +620,8 @@ static int nf_tables_table_enable(struct net *net,
}
return 0;
err:
- list_for_each_entry(chain, &table->chains, list) {
- if (!nft_is_active_next(net, chain))
- continue;
- if (!(chain->flags & NFT_BASE_CHAIN))
- continue;
-
- if (i-- <= 0)
- break;
-
- nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
- afi->nops);
- }
+ if (i)
+ _nf_tables_table_disable(net, afi, table, i);
return err;
}
@@ -578,17 +629,7 @@ static void nf_tables_table_disable(struct net *net,
const struct nft_af_info *afi,
struct nft_table *table)
{
- struct nft_chain *chain;
-
- list_for_each_entry(chain, &table->chains, list) {
- if (!nft_is_active_next(net, chain))
- continue;
- if (!(chain->flags & NFT_BASE_CHAIN))
- continue;
-
- nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
- afi->nops);
- }
+ _nf_tables_table_disable(net, afi, table, 0);
}
static int nf_tables_updtable(struct nft_ctx *ctx)
@@ -657,10 +698,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
if (IS_ERR(table)) {
if (PTR_ERR(table) != -ENOENT)
return PTR_ERR(table);
- table = NULL;
- }
-
- if (table != NULL) {
+ } else {
if (nlh->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
if (nlh->nlmsg_flags & NLM_F_REPLACE)
@@ -688,6 +726,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN);
INIT_LIST_HEAD(&table->chains);
INIT_LIST_HEAD(&table->sets);
+ INIT_LIST_HEAD(&table->objects);
table->flags = flags;
nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
@@ -709,6 +748,7 @@ static int nft_flush_table(struct nft_ctx *ctx)
{
int err;
struct nft_chain *chain, *nc;
+ struct nft_object *obj, *ne;
struct nft_set *set, *ns;
list_for_each_entry(chain, &ctx->table->chains, list) {
@@ -735,6 +775,12 @@ static int nft_flush_table(struct nft_ctx *ctx)
goto out;
}
+ list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) {
+ err = nft_delobj(ctx, obj);
+ if (err < 0)
+ goto out;
+ }
+
list_for_each_entry_safe(chain, nc, &ctx->table->chains, list) {
if (!nft_is_active_next(ctx->net, chain))
continue;
@@ -881,7 +927,8 @@ static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table,
}
static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
- [NFTA_CHAIN_TABLE] = { .type = NLA_STRING },
+ [NFTA_CHAIN_TABLE] = { .type = NLA_STRING,
+ .len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_CHAIN_HANDLE] = { .type = NLA_U64 },
[NFTA_CHAIN_NAME] = { .type = NLA_STRING,
.len = NFT_CHAIN_MAXNAMELEN - 1 },
@@ -999,16 +1046,15 @@ nla_put_failure:
return -1;
}
-static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
+static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
{
struct sk_buff *skb;
int err;
if (!ctx->report &&
!nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
- return 0;
+ return;
- err = -ENOBUFS;
skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
if (skb == NULL)
goto err;
@@ -1021,14 +1067,11 @@ static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
goto err;
}
- err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+ ctx->report, GFP_KERNEL);
+ return;
err:
- if (err < 0) {
- nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- err);
- }
- return err;
+ nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
}
static int nf_tables_dump_chains(struct sk_buff *skb,
@@ -1807,7 +1850,8 @@ static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain,
}
static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
- [NFTA_RULE_TABLE] = { .type = NLA_STRING },
+ [NFTA_RULE_TABLE] = { .type = NLA_STRING,
+ .len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_RULE_CHAIN] = { .type = NLA_STRING,
.len = NFT_CHAIN_MAXNAMELEN - 1 },
[NFTA_RULE_HANDLE] = { .type = NLA_U64 },
@@ -1882,18 +1926,16 @@ nla_put_failure:
return -1;
}
-static int nf_tables_rule_notify(const struct nft_ctx *ctx,
- const struct nft_rule *rule,
- int event)
+static void nf_tables_rule_notify(const struct nft_ctx *ctx,
+ const struct nft_rule *rule, int event)
{
struct sk_buff *skb;
int err;
if (!ctx->report &&
!nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
- return 0;
+ return;
- err = -ENOBUFS;
skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
if (skb == NULL)
goto err;
@@ -1906,14 +1948,11 @@ static int nf_tables_rule_notify(const struct nft_ctx *ctx,
goto err;
}
- err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+ ctx->report, GFP_KERNEL);
+ return;
err:
- if (err < 0) {
- nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- err);
- }
- return err;
+ nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
}
struct nft_rule_dump_ctx {
@@ -2068,7 +2107,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
* is called on error from nf_tables_newrule().
*/
expr = nft_expr_first(rule);
- while (expr->ops && expr != nft_expr_last(rule)) {
+ while (expr != nft_expr_last(rule) && expr->ops) {
nf_tables_expr_destroy(ctx, expr);
expr = nft_expr_next(expr);
}
@@ -2245,6 +2284,22 @@ err1:
return err;
}
+static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
+ const struct nlattr *nla)
+{
+ u32 id = ntohl(nla_get_be32(nla));
+ struct nft_trans *trans;
+
+ list_for_each_entry(trans, &net->nft.commit_list, list) {
+ struct nft_rule *rule = nft_trans_rule(trans);
+
+ if (trans->msg_type == NFT_MSG_NEWRULE &&
+ id == nft_trans_rule_id(trans))
+ return rule;
+ }
+ return ERR_PTR(-ENOENT);
+}
+
static int nf_tables_delrule(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
@@ -2283,6 +2338,12 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
return PTR_ERR(rule);
err = nft_delrule(&ctx, rule);
+ } else if (nla[NFTA_RULE_ID]) {
+ rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]);
+ if (IS_ERR(rule))
+ return PTR_ERR(rule);
+
+ err = nft_delrule(&ctx, rule);
} else {
err = nft_delrule_by_chain(&ctx);
}
@@ -2350,12 +2411,14 @@ nft_select_set_ops(const struct nlattr * const nla[],
features = 0;
if (nla[NFTA_SET_FLAGS] != NULL) {
features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
- features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT;
+ features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT |
+ NFT_SET_OBJECT;
}
- bops = NULL;
- best.size = ~0;
- best.class = ~0;
+ bops = NULL;
+ best.size = ~0;
+ best.lookup = ~0;
+ best.space = ~0;
list_for_each_entry(ops, &nf_tables_set_ops, list) {
if ((ops->features & features) != features)
@@ -2365,16 +2428,27 @@ nft_select_set_ops(const struct nlattr * const nla[],
switch (policy) {
case NFT_SET_POL_PERFORMANCE:
- if (est.class < best.class)
- break;
- if (est.class == best.class && est.size < best.size)
+ if (est.lookup < best.lookup)
break;
+ if (est.lookup == best.lookup) {
+ if (!desc->size) {
+ if (est.space < best.space)
+ break;
+ } else if (est.size < best.size) {
+ break;
+ }
+ }
continue;
case NFT_SET_POL_MEMORY:
- if (est.size < best.size)
- break;
- if (est.size == best.size && est.class < best.class)
+ if (!desc->size) {
+ if (est.space < best.space)
+ break;
+ if (est.space == best.space &&
+ est.lookup < best.lookup)
+ break;
+ } else if (est.size < best.size) {
break;
+ }
continue;
default:
break;
@@ -2396,7 +2470,8 @@ nft_select_set_ops(const struct nlattr * const nla[],
}
static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
- [NFTA_SET_TABLE] = { .type = NLA_STRING },
+ [NFTA_SET_TABLE] = { .type = NLA_STRING,
+ .len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_SET_NAME] = { .type = NLA_STRING,
.len = NFT_SET_MAXNAMELEN - 1 },
[NFTA_SET_FLAGS] = { .type = NLA_U32 },
@@ -2411,6 +2486,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
[NFTA_SET_GC_INTERVAL] = { .type = NLA_U32 },
[NFTA_SET_USERDATA] = { .type = NLA_BINARY,
.len = NFT_USERDATA_MAXLEN },
+ [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 },
};
static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
@@ -2462,6 +2538,7 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
}
return ERR_PTR(-ENOENT);
}
+EXPORT_SYMBOL_GPL(nf_tables_set_lookup);
struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
const struct nlattr *nla,
@@ -2480,6 +2557,7 @@ struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
}
return ERR_PTR(-ENOENT);
}
+EXPORT_SYMBOL_GPL(nf_tables_set_lookup_byid);
static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
const char *name)
@@ -2568,6 +2646,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen)))
goto nla_put_failure;
}
+ if (set->flags & NFT_SET_OBJECT &&
+ nla_put_be32(skb, NFTA_SET_OBJ_TYPE, htonl(set->objtype)))
+ goto nla_put_failure;
if (set->timeout &&
nla_put_be64(skb, NFTA_SET_TIMEOUT,
@@ -2602,9 +2683,9 @@ nla_put_failure:
return -1;
}
-static int nf_tables_set_notify(const struct nft_ctx *ctx,
- const struct nft_set *set,
- int event, gfp_t gfp_flags)
+static void nf_tables_set_notify(const struct nft_ctx *ctx,
+ const struct nft_set *set, int event,
+ gfp_t gfp_flags)
{
struct sk_buff *skb;
u32 portid = ctx->portid;
@@ -2612,9 +2693,8 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx,
if (!ctx->report &&
!nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
- return 0;
+ return;
- err = -ENOBUFS;
skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags);
if (skb == NULL)
goto err;
@@ -2625,12 +2705,11 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx,
goto err;
}
- err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES,
- ctx->report, gfp_flags);
+ nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, ctx->report,
+ gfp_flags);
+ return;
err:
- if (err < 0)
- nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, err);
- return err;
+ nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
}
static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
@@ -2797,7 +2876,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
unsigned int size;
bool create;
u64 timeout;
- u32 ktype, dtype, flags, policy, gc_int;
+ u32 ktype, dtype, flags, policy, gc_int, objtype;
struct nft_set_desc desc;
unsigned char *udata;
u16 udlen;
@@ -2827,11 +2906,12 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |
NFT_SET_INTERVAL | NFT_SET_TIMEOUT |
- NFT_SET_MAP | NFT_SET_EVAL))
+ NFT_SET_MAP | NFT_SET_EVAL |
+ NFT_SET_OBJECT))
return -EINVAL;
- /* Only one of both operations is supported */
- if ((flags & (NFT_SET_MAP | NFT_SET_EVAL)) ==
- (NFT_SET_MAP | NFT_SET_EVAL))
+ /* Only one of these operations is supported */
+ if ((flags & (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) ==
+ (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT))
return -EOPNOTSUPP;
}
@@ -2856,6 +2936,19 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
} else if (flags & NFT_SET_MAP)
return -EINVAL;
+ if (nla[NFTA_SET_OBJ_TYPE] != NULL) {
+ if (!(flags & NFT_SET_OBJECT))
+ return -EINVAL;
+
+ objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE]));
+ if (objtype == NFT_OBJECT_UNSPEC ||
+ objtype > NFT_OBJECT_MAX)
+ return -EINVAL;
+ } else if (flags & NFT_SET_OBJECT)
+ return -EINVAL;
+ else
+ objtype = NFT_OBJECT_UNSPEC;
+
timeout = 0;
if (nla[NFTA_SET_TIMEOUT] != NULL) {
if (!(flags & NFT_SET_TIMEOUT))
@@ -2896,10 +2989,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
if (IS_ERR(set)) {
if (PTR_ERR(set) != -ENOENT)
return PTR_ERR(set);
- set = NULL;
- }
-
- if (set != NULL) {
+ } else {
if (nlh->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
if (nlh->nlmsg_flags & NLM_F_REPLACE)
@@ -2943,6 +3033,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
set->ktype = ktype;
set->klen = desc.klen;
set->dtype = dtype;
+ set->objtype = objtype;
set->dlen = desc.dlen;
set->flags = flags;
set->size = desc.size;
@@ -3016,9 +3107,9 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
}
static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
- const struct nft_set *set,
+ struct nft_set *set,
const struct nft_set_iter *iter,
- const struct nft_set_elem *elem)
+ struct nft_set_elem *elem)
{
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
enum nft_registers dreg;
@@ -3064,6 +3155,7 @@ bind:
list_add_tail_rcu(&binding->list, &set->bindings);
return 0;
}
+EXPORT_SYMBOL_GPL(nf_tables_bind_set);
void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_binding *binding)
@@ -3074,6 +3166,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
nft_is_active(ctx->net, set))
nf_tables_set_destroy(ctx, set);
}
+EXPORT_SYMBOL_GPL(nf_tables_unbind_set);
const struct nft_set_ext_type nft_set_ext_types[] = {
[NFT_SET_EXT_KEY] = {
@@ -3085,6 +3178,10 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
[NFT_SET_EXT_EXPR] = {
.align = __alignof__(struct nft_expr),
},
+ [NFT_SET_EXT_OBJREF] = {
+ .len = sizeof(struct nft_object *),
+ .align = __alignof__(struct nft_object *),
+ },
[NFT_SET_EXT_FLAGS] = {
.len = sizeof(u8),
.align = __alignof__(u8),
@@ -3118,8 +3215,10 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
};
static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
- [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING },
- [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING },
+ [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING,
+ .len = NFT_TABLE_MAXNAMELEN - 1 },
+ [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING,
+ .len = NFT_SET_MAXNAMELEN - 1 },
[NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED },
[NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 },
};
@@ -3173,6 +3272,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0)
goto nla_put_failure;
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
+ nla_put_string(skb, NFTA_SET_ELEM_OBJREF,
+ (*nft_set_ext_obj(ext))->name) < 0)
+ goto nla_put_failure;
+
if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
nla_put_be32(skb, NFTA_SET_ELEM_FLAGS,
htonl(*nft_set_ext_flags(ext))))
@@ -3224,9 +3328,9 @@ struct nft_set_dump_args {
};
static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
- const struct nft_set *set,
+ struct nft_set *set,
const struct nft_set_iter *iter,
- const struct nft_set_elem *elem)
+ struct nft_set_elem *elem)
{
struct nft_set_dump_args *args;
@@ -3238,7 +3342,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
u8 genmask = nft_genmask_cur(net);
- const struct nft_set *set;
+ struct nft_set *set;
struct nft_set_dump_args args;
struct nft_ctx ctx;
struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1];
@@ -3383,10 +3487,10 @@ nla_put_failure:
return -1;
}
-static int nf_tables_setelem_notify(const struct nft_ctx *ctx,
- const struct nft_set *set,
- const struct nft_set_elem *elem,
- int event, u16 flags)
+static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nft_set_elem *elem,
+ int event, u16 flags)
{
struct net *net = ctx->net;
u32 portid = ctx->portid;
@@ -3394,9 +3498,8 @@ static int nf_tables_setelem_notify(const struct nft_ctx *ctx,
int err;
if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
- return 0;
+ return;
- err = -ENOBUFS;
skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
if (skb == NULL)
goto err;
@@ -3408,12 +3511,11 @@ static int nf_tables_setelem_notify(const struct nft_ctx *ctx,
goto err;
}
- err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report,
- GFP_KERNEL);
+ nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report,
+ GFP_KERNEL);
+ return;
err:
- if (err < 0)
- nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
- return err;
+ nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
}
static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
@@ -3467,7 +3569,8 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
nft_data_uninit(nft_set_ext_data(ext), set->dtype);
if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
-
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
+ (*nft_set_ext_obj(ext))->use--;
kfree(elem);
}
EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
@@ -3492,11 +3595,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr, u32 nlmsg_flags)
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
+ u8 genmask = nft_genmask_next(ctx->net);
struct nft_data_desc d1, d2;
struct nft_set_ext_tmpl tmpl;
struct nft_set_ext *ext, *ext2;
struct nft_set_elem elem;
struct nft_set_binding *binding;
+ struct nft_object *obj = NULL;
struct nft_userdata *udata;
struct nft_data data;
enum nft_registers dreg;
@@ -3559,6 +3664,20 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
}
+ if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
+ if (!(set->flags & NFT_SET_OBJECT)) {
+ err = -EINVAL;
+ goto err2;
+ }
+ obj = nf_tables_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF],
+ set->objtype, genmask);
+ if (IS_ERR(obj)) {
+ err = PTR_ERR(obj);
+ goto err2;
+ }
+ nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
+ }
+
if (nla[NFTA_SET_ELEM_DATA] != NULL) {
err = nft_data_init(ctx, &data, sizeof(data), &d2,
nla[NFTA_SET_ELEM_DATA]);
@@ -3617,6 +3736,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
udata->len = ulen - 1;
nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen);
}
+ if (obj) {
+ *nft_set_ext_obj(ext) = obj;
+ obj->use++;
+ }
trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
if (trans == NULL)
@@ -3626,10 +3749,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
err = set->ops->insert(ctx->net, set, &elem, &ext2);
if (err) {
if (err == -EEXIST) {
- if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
- nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
- memcmp(nft_set_ext_data(ext),
- nft_set_ext_data(ext2), set->dlen) != 0)
+ if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
+ nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
+ memcmp(nft_set_ext_data(ext),
+ nft_set_ext_data(ext2), set->dlen) != 0) ||
+ (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
+ nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) &&
+ *nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2)))
err = -EBUSY;
else if (!(nlmsg_flags & NLM_F_EXCL))
err = 0;
@@ -3637,10 +3763,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
goto err5;
}
+ if (set->size &&
+ !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) {
+ err = -ENFILE;
+ goto err6;
+ }
+
nft_trans_elem(trans) = elem;
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
return 0;
+err6:
+ set->ops->remove(ctx->net, set, &elem);
err5:
kfree(trans);
err4:
@@ -3687,15 +3821,9 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
return -EBUSY;
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
- if (set->size &&
- !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact))
- return -ENFILE;
-
err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags);
- if (err < 0) {
- atomic_dec(&set->nelems);
+ if (err < 0)
break;
- }
}
return err;
}
@@ -3779,6 +3907,35 @@ err1:
return err;
}
+static int nft_flush_set(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ const struct nft_set_iter *iter,
+ struct nft_set_elem *elem)
+{
+ struct nft_trans *trans;
+ int err;
+
+ trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
+ sizeof(struct nft_trans_elem), GFP_ATOMIC);
+ if (!trans)
+ return -ENOMEM;
+
+ if (!set->ops->flush(ctx->net, set, elem->priv)) {
+ err = -ENOENT;
+ goto err1;
+ }
+ set->ndeact++;
+
+ nft_trans_elem_set(trans) = set;
+ nft_trans_elem(trans) = *elem;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+ return 0;
+err1:
+ kfree(trans);
+ return err;
+}
+
static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
@@ -3789,9 +3946,6 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
struct nft_ctx ctx;
int rem, err = 0;
- if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
- return -EINVAL;
-
err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
if (err < 0)
return err;
@@ -3803,6 +3957,16 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
return -EBUSY;
+ if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) {
+ struct nft_set_iter iter = {
+ .genmask = genmask,
+ .fn = nft_flush_set,
+ };
+ set->ops->walk(&ctx, set, &iter);
+
+ return iter.err;
+ }
+
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
err = nft_del_setelem(&ctx, set, attr);
if (err < 0)
@@ -3838,6 +4002,496 @@ struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
}
EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc);
+/*
+ * Stateful objects
+ */
+
+/**
+ * nft_register_obj- register nf_tables stateful object type
+ * @obj: object type
+ *
+ * Registers the object type for use with nf_tables. Returns zero on
+ * success or a negative errno code otherwise.
+ */
+int nft_register_obj(struct nft_object_type *obj_type)
+{
+ if (obj_type->type == NFT_OBJECT_UNSPEC)
+ return -EINVAL;
+
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_add_rcu(&obj_type->list, &nf_tables_objects);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_obj);
+
+/**
+ * nft_unregister_obj - unregister nf_tables object type
+ * @obj: object type
+ *
+ * Unregisters the object type for use with nf_tables.
+ */
+void nft_unregister_obj(struct nft_object_type *obj_type)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_del_rcu(&obj_type->list);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_obj);
+
+struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
+ const struct nlattr *nla,
+ u32 objtype, u8 genmask)
+{
+ struct nft_object *obj;
+
+ list_for_each_entry(obj, &table->objects, list) {
+ if (!nla_strcmp(nla, obj->name) &&
+ objtype == obj->type->type &&
+ nft_active_genmask(obj, genmask))
+ return obj;
+ }
+ return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL_GPL(nf_tables_obj_lookup);
+
+static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
+ [NFTA_OBJ_TABLE] = { .type = NLA_STRING,
+ .len = NFT_TABLE_MAXNAMELEN - 1 },
+ [NFTA_OBJ_NAME] = { .type = NLA_STRING,
+ .len = NFT_OBJ_MAXNAMELEN - 1 },
+ [NFTA_OBJ_TYPE] = { .type = NLA_U32 },
+ [NFTA_OBJ_DATA] = { .type = NLA_NESTED },
+};
+
+static struct nft_object *nft_obj_init(const struct nft_object_type *type,
+ const struct nlattr *attr)
+{
+ struct nlattr *tb[type->maxattr + 1];
+ struct nft_object *obj;
+ int err;
+
+ if (attr) {
+ err = nla_parse_nested(tb, type->maxattr, attr, type->policy);
+ if (err < 0)
+ goto err1;
+ } else {
+ memset(tb, 0, sizeof(tb[0]) * (type->maxattr + 1));
+ }
+
+ err = -ENOMEM;
+ obj = kzalloc(sizeof(struct nft_object) + type->size, GFP_KERNEL);
+ if (obj == NULL)
+ goto err1;
+
+ err = type->init((const struct nlattr * const *)tb, obj);
+ if (err < 0)
+ goto err2;
+
+ obj->type = type;
+ return obj;
+err2:
+ kfree(obj);
+err1:
+ return ERR_PTR(err);
+}
+
+static int nft_object_dump(struct sk_buff *skb, unsigned int attr,
+ struct nft_object *obj, bool reset)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, attr);
+ if (!nest)
+ goto nla_put_failure;
+ if (obj->type->dump(skb, obj, reset) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const struct nft_object_type *__nft_obj_type_get(u32 objtype)
+{
+ const struct nft_object_type *type;
+
+ list_for_each_entry(type, &nf_tables_objects, list) {
+ if (objtype == type->type)
+ return type;
+ }
+ return NULL;
+}
+
+static const struct nft_object_type *nft_obj_type_get(u32 objtype)
+{
+ const struct nft_object_type *type;
+
+ type = __nft_obj_type_get(objtype);
+ if (type != NULL && try_module_get(type->owner))
+ return type;
+
+#ifdef CONFIG_MODULES
+ if (type == NULL) {
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-obj-%u", objtype);
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (__nft_obj_type_get(objtype))
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ return ERR_PTR(-ENOENT);
+}
+
+static int nf_tables_newobj(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nft_object_type *type;
+ u8 genmask = nft_genmask_next(net);
+ int family = nfmsg->nfgen_family;
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct nft_object *obj;
+ struct nft_ctx ctx;
+ u32 objtype;
+ int err;
+
+ if (!nla[NFTA_OBJ_TYPE] ||
+ !nla[NFTA_OBJ_NAME] ||
+ !nla[NFTA_OBJ_DATA])
+ return -EINVAL;
+
+ afi = nf_tables_afinfo_lookup(net, family, true);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+
+ objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+ obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+ if (IS_ERR(obj)) {
+ err = PTR_ERR(obj);
+ if (err != -ENOENT)
+ return err;
+
+ } else {
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+
+ return 0;
+ }
+
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+
+ type = nft_obj_type_get(objtype);
+ if (IS_ERR(type))
+ return PTR_ERR(type);
+
+ obj = nft_obj_init(type, nla[NFTA_OBJ_DATA]);
+ if (IS_ERR(obj)) {
+ err = PTR_ERR(obj);
+ goto err1;
+ }
+ obj->table = table;
+ nla_strlcpy(obj->name, nla[NFTA_OBJ_NAME], NFT_OBJ_MAXNAMELEN);
+
+ err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj);
+ if (err < 0)
+ goto err2;
+
+ list_add_tail_rcu(&obj->list, &table->objects);
+ table->use++;
+ return 0;
+err2:
+ if (obj->type->destroy)
+ obj->type->destroy(obj);
+ kfree(obj);
+err1:
+ module_put(type->owner);
+ return err;
+}
+
+static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
+ u32 portid, u32 seq, int event, u32 flags,
+ int family, const struct nft_table *table,
+ struct nft_object *obj, bool reset)
+{
+ struct nfgenmsg *nfmsg;
+ struct nlmsghdr *nlh;
+
+ event |= NFNL_SUBSYS_NFTABLES << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
+
+ if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) ||
+ nla_put_string(skb, NFTA_OBJ_NAME, obj->name) ||
+ nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->type->type)) ||
+ nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
+ nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, nlh);
+ return -1;
+}
+
+struct nft_obj_filter {
+ char table[NFT_OBJ_MAXNAMELEN];
+ u32 type;
+};
+
+static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ unsigned int idx = 0, s_idx = cb->args[0];
+ struct nft_obj_filter *filter = cb->data;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ struct nft_object *obj;
+ bool reset = false;
+
+ if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
+ reset = true;
+
+ rcu_read_lock();
+ cb->seq = net->nft.base_seq;
+
+ list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+ if (family != NFPROTO_UNSPEC && family != afi->family)
+ continue;
+
+ list_for_each_entry_rcu(table, &afi->tables, list) {
+ list_for_each_entry_rcu(obj, &table->objects, list) {
+ if (!nft_is_active(net, obj))
+ goto cont;
+ if (idx < s_idx)
+ goto cont;
+ if (idx > s_idx)
+ memset(&cb->args[1], 0,
+ sizeof(cb->args) - sizeof(cb->args[0]));
+ if (filter && filter->table[0] &&
+ strcmp(filter->table, table->name))
+ goto cont;
+ if (filter &&
+ filter->type != NFT_OBJECT_UNSPEC &&
+ obj->type->type != filter->type)
+ goto cont;
+
+ if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFT_MSG_NEWOBJ,
+ NLM_F_MULTI | NLM_F_APPEND,
+ afi->family, table, obj, reset) < 0)
+ goto done;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+ }
+done:
+ rcu_read_unlock();
+
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static int nf_tables_dump_obj_done(struct netlink_callback *cb)
+{
+ kfree(cb->data);
+
+ return 0;
+}
+
+static struct nft_obj_filter *
+nft_obj_filter_alloc(const struct nlattr * const nla[])
+{
+ struct nft_obj_filter *filter;
+
+ filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ if (!filter)
+ return ERR_PTR(-ENOMEM);
+
+ if (nla[NFTA_OBJ_TABLE])
+ nla_strlcpy(filter->table, nla[NFTA_OBJ_TABLE],
+ NFT_TABLE_MAXNAMELEN);
+ if (nla[NFTA_OBJ_TYPE])
+ filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+
+ return filter;
+}
+
+static int nf_tables_getobj(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u8 genmask = nft_genmask_cur(net);
+ int family = nfmsg->nfgen_family;
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ struct nft_object *obj;
+ struct sk_buff *skb2;
+ bool reset = false;
+ u32 objtype;
+ int err;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nf_tables_dump_obj,
+ .done = nf_tables_dump_obj_done,
+ };
+
+ if (nla[NFTA_OBJ_TABLE] ||
+ nla[NFTA_OBJ_TYPE]) {
+ struct nft_obj_filter *filter;
+
+ filter = nft_obj_filter_alloc(nla);
+ if (IS_ERR(filter))
+ return -ENOMEM;
+
+ c.data = filter;
+ }
+ return netlink_dump_start(nlsk, skb, nlh, &c);
+ }
+
+ if (!nla[NFTA_OBJ_NAME] ||
+ !nla[NFTA_OBJ_TYPE])
+ return -EINVAL;
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+
+ objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+ obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb2)
+ return -ENOMEM;
+
+ if (NFNL_MSG_TYPE(nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
+ reset = true;
+
+ err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
+ family, table, obj, reset);
+ if (err < 0)
+ goto err;
+
+ return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+err:
+ kfree_skb(skb2);
+ return err;
+
+ return 0;
+}
+
+static void nft_obj_destroy(struct nft_object *obj)
+{
+ if (obj->type->destroy)
+ obj->type->destroy(obj);
+
+ module_put(obj->type->owner);
+ kfree(obj);
+}
+
+static int nf_tables_delobj(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u8 genmask = nft_genmask_next(net);
+ int family = nfmsg->nfgen_family;
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct nft_object *obj;
+ struct nft_ctx ctx;
+ u32 objtype;
+
+ if (!nla[NFTA_OBJ_TYPE] ||
+ !nla[NFTA_OBJ_NAME])
+ return -EINVAL;
+
+ afi = nf_tables_afinfo_lookup(net, family, true);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+
+ objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+ obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+ if (obj->use > 0)
+ return -EBUSY;
+
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+
+ return nft_delobj(&ctx, obj);
+}
+
+void nft_obj_notify(struct net *net, struct nft_table *table,
+ struct nft_object *obj, u32 portid, u32 seq, int event,
+ int family, int report, gfp_t gfp)
+{
+ struct sk_buff *skb;
+ int err;
+
+ if (!report &&
+ !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
+ return;
+
+ skb = nlmsg_new(NLMSG_GOODSIZE, gfp);
+ if (skb == NULL)
+ goto err;
+
+ err = nf_tables_fill_obj_info(skb, net, portid, seq, event, 0, family,
+ table, obj, false);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto err;
+ }
+
+ nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp);
+ return;
+err:
+ nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
+}
+EXPORT_SYMBOL_GPL(nft_obj_notify);
+
+static void nf_tables_obj_notify(const struct nft_ctx *ctx,
+ struct nft_object *obj, int event)
+{
+ nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event,
+ ctx->afi->family, ctx->report, GFP_KERNEL);
+}
+
static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
u32 portid, u32 seq)
{
@@ -3865,7 +4519,8 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event)
+static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb,
+ int event)
{
struct nlmsghdr *nlh = nlmsg_hdr(skb);
struct sk_buff *skb2;
@@ -3873,9 +4528,8 @@ static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event)
if (nlmsg_report(nlh) &&
!nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
- return 0;
+ return;
- err = -ENOBUFS;
skb2 = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
if (skb2 == NULL)
goto err;
@@ -3887,14 +4541,12 @@ static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event)
goto err;
}
- err = nfnetlink_send(skb2, net, NETLINK_CB(skb).portid,
- NFNLGRP_NFTABLES, nlmsg_report(nlh), GFP_KERNEL);
+ nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES,
+ nlmsg_report(nlh), GFP_KERNEL);
+ return;
err:
- if (err < 0) {
- nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES,
- err);
- }
- return err;
+ nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES,
+ -ENOBUFS);
}
static int nf_tables_getgen(struct net *net, struct sock *nlsk,
@@ -3998,6 +4650,26 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
[NFT_MSG_GETGEN] = {
.call = nf_tables_getgen,
},
+ [NFT_MSG_NEWOBJ] = {
+ .call_batch = nf_tables_newobj,
+ .attr_count = NFTA_OBJ_MAX,
+ .policy = nft_obj_policy,
+ },
+ [NFT_MSG_GETOBJ] = {
+ .call = nf_tables_getobj,
+ .attr_count = NFTA_OBJ_MAX,
+ .policy = nft_obj_policy,
+ },
+ [NFT_MSG_DELOBJ] = {
+ .call_batch = nf_tables_delobj,
+ .attr_count = NFTA_OBJ_MAX,
+ .policy = nft_obj_policy,
+ },
+ [NFT_MSG_GETOBJ_RESET] = {
+ .call = nf_tables_getobj,
+ .attr_count = NFTA_OBJ_MAX,
+ .policy = nft_obj_policy,
+ },
};
static void nft_chain_commit_update(struct nft_trans *trans)
@@ -4040,6 +4712,9 @@ static void nf_tables_commit_release(struct nft_trans *trans)
nft_set_elem_destroy(nft_trans_elem_set(trans),
nft_trans_elem(trans).priv, true);
break;
+ case NFT_MSG_DELOBJ:
+ nft_obj_destroy(nft_trans_obj(trans));
+ break;
}
kfree(trans);
}
@@ -4143,10 +4818,21 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nf_tables_setelem_notify(&trans->ctx, te->set,
&te->elem,
NFT_MSG_DELSETELEM, 0);
- te->set->ops->remove(te->set, &te->elem);
+ te->set->ops->remove(net, te->set, &te->elem);
atomic_dec(&te->set->nelems);
te->set->ndeact--;
break;
+ case NFT_MSG_NEWOBJ:
+ nft_clear(net, nft_trans_obj(trans));
+ nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
+ NFT_MSG_NEWOBJ);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELOBJ:
+ list_del_rcu(&nft_trans_obj(trans)->list);
+ nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
+ NFT_MSG_DELOBJ);
+ break;
}
}
@@ -4181,6 +4867,9 @@ static void nf_tables_abort_release(struct nft_trans *trans)
nft_set_elem_destroy(nft_trans_elem_set(trans),
nft_trans_elem(trans).priv, true);
break;
+ case NFT_MSG_NEWOBJ:
+ nft_obj_destroy(nft_trans_obj(trans));
+ break;
}
kfree(trans);
}
@@ -4250,7 +4939,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
case NFT_MSG_NEWSETELEM:
te = (struct nft_trans_elem *)trans->data;
- te->set->ops->remove(te->set, &te->elem);
+ te->set->ops->remove(net, te->set, &te->elem);
atomic_dec(&te->set->nelems);
break;
case NFT_MSG_DELSETELEM:
@@ -4261,6 +4950,15 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
nft_trans_destroy(trans);
break;
+ case NFT_MSG_NEWOBJ:
+ trans->ctx.table->use--;
+ list_del_rcu(&nft_trans_obj(trans)->list);
+ break;
+ case NFT_MSG_DELOBJ:
+ trans->ctx.table->use++;
+ nft_clear(trans->ctx.net, nft_trans_obj(trans));
+ nft_trans_destroy(trans);
+ break;
}
}
@@ -4275,6 +4973,11 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
return 0;
}
+static bool nf_tables_valid_genid(struct net *net, u32 genid)
+{
+ return net->nft.base_seq == genid;
+}
+
static const struct nfnetlink_subsystem nf_tables_subsys = {
.name = "nf_tables",
.subsys_id = NFNL_SUBSYS_NFTABLES,
@@ -4282,6 +4985,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
.cb = nf_tables_cb,
.commit = nf_tables_commit,
.abort = nf_tables_abort,
+ .valid_genid = nf_tables_valid_genid,
};
int nft_chain_validate_dependency(const struct nft_chain *chain,
@@ -4329,9 +5033,9 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
const struct nft_chain *chain);
static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
- const struct nft_set *set,
+ struct nft_set *set,
const struct nft_set_iter *iter,
- const struct nft_set_elem *elem)
+ struct nft_set_elem *elem)
{
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
const struct nft_data *data;
@@ -4355,7 +5059,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
{
const struct nft_rule *rule;
const struct nft_expr *expr, *last;
- const struct nft_set *set;
+ struct nft_set *set;
struct nft_set_binding *binding;
struct nft_set_iter iter;
@@ -4807,6 +5511,7 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
{
struct nft_table *table, *nt;
struct nft_chain *chain, *nc;
+ struct nft_object *obj, *ne;
struct nft_rule *rule, *nr;
struct nft_set *set, *ns;
struct nft_ctx ctx = {
@@ -4833,6 +5538,11 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
table->use--;
nft_set_destroy(set);
}
+ list_for_each_entry_safe(obj, ne, &table->objects, list) {
+ list_del(&obj->list);
+ table->use--;
+ nft_obj_destroy(obj);
+ }
list_for_each_entry_safe(chain, nc, &table->chains, list) {
list_del(&chain->list);
table->use--;
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 0dd5c695482f..65dbeadcb118 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -53,10 +53,10 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info,
nft_trace_notify(info);
- nf_log_trace(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in,
- pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ",
- chain->table->name, chain->name, comments[type],
- rulenum);
+ nf_log_trace(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb,
+ nft_in(pkt), nft_out(pkt), &trace_loginfo,
+ "TRACE: %s:%s:%s:%u ",
+ chain->table->name, chain->name, comments[type], rulenum);
}
static inline void nft_trace_packet(struct nft_traceinfo *info,
@@ -124,7 +124,7 @@ unsigned int
nft_do_chain(struct nft_pktinfo *pkt, void *priv)
{
const struct nft_chain *chain = priv, *basechain = chain;
- const struct net *net = pkt->net;
+ const struct net *net = nft_net(pkt);
const struct nft_rule *rule;
const struct nft_expr *expr, *last;
struct nft_regs regs;
@@ -178,6 +178,7 @@ next_rule:
case NF_ACCEPT:
case NF_DROP:
case NF_QUEUE:
+ case NF_STOLEN:
nft_trace_packet(&info, chain, rule,
rulenum, NFT_TRACETYPE_RULE);
return regs.verdict.code;
@@ -231,68 +232,40 @@ next_rule:
}
EXPORT_SYMBOL_GPL(nft_do_chain);
+static struct nft_expr_type *nft_basic_types[] = {
+ &nft_imm_type,
+ &nft_cmp_type,
+ &nft_lookup_type,
+ &nft_bitwise_type,
+ &nft_byteorder_type,
+ &nft_payload_type,
+ &nft_dynset_type,
+ &nft_range_type,
+};
+
int __init nf_tables_core_module_init(void)
{
- int err;
-
- err = nft_immediate_module_init();
- if (err < 0)
- goto err1;
-
- err = nft_cmp_module_init();
- if (err < 0)
- goto err2;
-
- err = nft_lookup_module_init();
- if (err < 0)
- goto err3;
-
- err = nft_bitwise_module_init();
- if (err < 0)
- goto err4;
+ int err, i;
- err = nft_byteorder_module_init();
- if (err < 0)
- goto err5;
-
- err = nft_payload_module_init();
- if (err < 0)
- goto err6;
-
- err = nft_dynset_module_init();
- if (err < 0)
- goto err7;
-
- err = nft_range_module_init();
- if (err < 0)
- goto err8;
+ for (i = 0; i < ARRAY_SIZE(nft_basic_types); i++) {
+ err = nft_register_expr(nft_basic_types[i]);
+ if (err)
+ goto err;
+ }
return 0;
-err8:
- nft_dynset_module_exit();
-err7:
- nft_payload_module_exit();
-err6:
- nft_byteorder_module_exit();
-err5:
- nft_bitwise_module_exit();
-err4:
- nft_lookup_module_exit();
-err3:
- nft_cmp_module_exit();
-err2:
- nft_immediate_module_exit();
-err1:
+
+err:
+ while (i-- > 0)
+ nft_unregister_expr(nft_basic_types[i]);
return err;
}
void nf_tables_core_module_exit(void)
{
- nft_dynset_module_exit();
- nft_payload_module_exit();
- nft_byteorder_module_exit();
- nft_bitwise_module_exit();
- nft_lookup_module_exit();
- nft_cmp_module_exit();
- nft_immediate_module_exit();
+ int i;
+
+ i = ARRAY_SIZE(nft_basic_types);
+ while (i-- > 0)
+ nft_unregister_expr(nft_basic_types[i]);
}
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index ab695f8e2d29..12eb9041dca2 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -171,7 +171,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
unsigned int size;
int event = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_TRACE;
- if (!nfnetlink_has_listeners(pkt->net, NFNLGRP_NFTRACE))
+ if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE))
return;
size = nlmsg_total_size(sizeof(struct nfgenmsg)) +
@@ -207,7 +207,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = 0;
- if (nla_put_be32(skb, NFTA_TRACE_NFPROTO, htonl(pkt->pf)))
+ if (nla_put_be32(skb, NFTA_TRACE_NFPROTO, htonl(nft_pf(pkt))))
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_TRACE_TYPE, htonl(info->type)))
@@ -249,7 +249,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
goto nla_put_failure;
if (!info->packet_dumped) {
- if (nf_trace_fill_dev_info(skb, pkt->in, pkt->out))
+ if (nf_trace_fill_dev_info(skb, nft_in(pkt), nft_out(pkt)))
goto nla_put_failure;
if (nf_trace_fill_pkt_info(skb, pkt))
@@ -258,7 +258,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
}
nlmsg_end(skb, nlh);
- nfnetlink_send(skb, pkt->net, 0, NFNLGRP_NFTRACE, 0, GFP_ATOMIC);
+ nfnetlink_send(skb, nft_net(pkt), 0, NFNLGRP_NFTRACE, 0, GFP_ATOMIC);
return;
nla_put_failure:
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 2278d9ab723b..68eda920160e 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -3,7 +3,7 @@
*
* (C) 2001 by Jay Schulist <jschlst@samba.org>,
* (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
- * (C) 2005,2007 by Pablo Neira Ayuso <pablo@netfilter.org>
+ * (C) 2005-2017 by Pablo Neira Ayuso <pablo@netfilter.org>
*
* Initial netfilter messages via netlink development funded and
* generally made possible by Network Robots, Inc. (www.networkrobots.com)
@@ -22,7 +22,7 @@
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/skbuff.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/sock.h>
#include <linux/init.h>
@@ -100,9 +100,9 @@ int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n)
}
EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);
-static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type)
+static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u16 type)
{
- u_int8_t subsys_id = NFNL_SUBSYS_ID(type);
+ u8 subsys_id = NFNL_SUBSYS_ID(type);
if (subsys_id >= NFNL_SUBSYS_COUNT)
return NULL;
@@ -111,9 +111,9 @@ static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t t
}
static inline const struct nfnl_callback *
-nfnetlink_find_client(u_int16_t type, const struct nfnetlink_subsystem *ss)
+nfnetlink_find_client(u16 type, const struct nfnetlink_subsystem *ss)
{
- u_int8_t cb_id = NFNL_MSG_TYPE(type);
+ u8 cb_id = NFNL_MSG_TYPE(type);
if (cb_id >= ss->cb_count)
return NULL;
@@ -185,7 +185,7 @@ replay:
{
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
- u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+ u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
struct nlattr *attr = (void *)nlh + min_len;
int attrlen = nlh->nlmsg_len - min_len;
@@ -273,13 +273,13 @@ enum {
};
static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
- u_int16_t subsys_id)
+ u16 subsys_id, u32 genid)
{
struct sk_buff *oskb = skb;
struct net *net = sock_net(skb->sk);
const struct nfnetlink_subsystem *ss;
const struct nfnl_callback *nc;
- static LIST_HEAD(err_list);
+ LIST_HEAD(err_list);
u32 status;
int err;
@@ -315,6 +315,12 @@ replay:
return kfree_skb(skb);
}
+ if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) {
+ nfnl_unlock(subsys_id);
+ netlink_ack(oskb, nlh, -ERESTART);
+ return kfree_skb(skb);
+ }
+
while (skb->len >= nlmsg_total_size(0)) {
int msglen, type;
@@ -365,7 +371,7 @@ replay:
{
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
- u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+ u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
struct nlattr *attr = (void *)nlh + min_len;
int attrlen = nlh->nlmsg_len - min_len;
@@ -436,11 +442,51 @@ done:
kfree_skb(skb);
}
+static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = {
+ [NFNL_BATCH_GENID] = { .type = NLA_U32 },
+};
+
+static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+ struct nlattr *attr = (void *)nlh + min_len;
+ struct nlattr *cda[NFNL_BATCH_MAX + 1];
+ int attrlen = nlh->nlmsg_len - min_len;
+ struct nfgenmsg *nfgenmsg;
+ int msglen, err;
+ u32 gen_id = 0;
+ u16 res_id;
+
+ msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (msglen > skb->len)
+ msglen = skb->len;
+
+ if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+ skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
+ return;
+
+ err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy);
+ if (err < 0) {
+ netlink_ack(skb, nlh, err);
+ return;
+ }
+ if (cda[NFNL_BATCH_GENID])
+ gen_id = ntohl(nla_get_be32(cda[NFNL_BATCH_GENID]));
+
+ nfgenmsg = nlmsg_data(nlh);
+ skb_pull(skb, msglen);
+ /* Work around old nft using host byte order */
+ if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
+ res_id = NFNL_SUBSYS_NFTABLES;
+ else
+ res_id = ntohs(nfgenmsg->res_id);
+
+ nfnetlink_rcv_batch(skb, nlh, res_id, gen_id);
+}
+
static void nfnetlink_rcv(struct sk_buff *skb)
{
struct nlmsghdr *nlh = nlmsg_hdr(skb);
- u_int16_t res_id;
- int msglen;
if (nlh->nlmsg_len < NLMSG_HDRLEN ||
skb->len < nlh->nlmsg_len)
@@ -451,28 +497,10 @@ static void nfnetlink_rcv(struct sk_buff *skb)
return;
}
- if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) {
- struct nfgenmsg *nfgenmsg;
-
- msglen = NLMSG_ALIGN(nlh->nlmsg_len);
- if (msglen > skb->len)
- msglen = skb->len;
-
- if (nlh->nlmsg_len < NLMSG_HDRLEN ||
- skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
- return;
-
- nfgenmsg = nlmsg_data(nlh);
- skb_pull(skb, msglen);
- /* Work around old nft using host byte order */
- if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
- res_id = NFNL_SUBSYS_NFTABLES;
- else
- res_id = ntohs(nfgenmsg->res_id);
- nfnetlink_rcv_batch(skb, nlh, res_id);
- } else {
+ if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN)
+ nfnetlink_rcv_skb_batch(skb, nlh);
+ else
netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
- }
}
#ifdef CONFIG_MODULES
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 3b79f34b5095..d45558178da5 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -32,6 +32,13 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_DESCRIPTION("nfnl_cthelper: User-space connection tracking helpers");
+struct nfnl_cthelper {
+ struct list_head list;
+ struct nf_conntrack_helper helper;
+};
+
+static LIST_HEAD(nfnl_cthelper_list);
+
static int
nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff,
struct nf_conn *ct, enum ip_conntrack_info ctinfo)
@@ -48,7 +55,7 @@ nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff,
if (helper == NULL)
return NF_DROP;
- /* This is an user-space helper not yet configured, skip. */
+ /* This is a user-space helper not yet configured, skip. */
if ((helper->flags &
(NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) ==
NF_CT_HELPER_F_USERSPACE)
@@ -161,6 +168,7 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
int i, ret;
struct nf_conntrack_expect_policy *expect_policy;
struct nlattr *tb[NFCTH_POLICY_SET_MAX+1];
+ unsigned int class_max;
ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
nfnl_cthelper_expect_policy_set);
@@ -170,19 +178,18 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
if (!tb[NFCTH_POLICY_SET_NUM])
return -EINVAL;
- helper->expect_class_max =
- ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM]));
-
- if (helper->expect_class_max != 0 &&
- helper->expect_class_max > NF_CT_MAX_EXPECT_CLASSES)
+ class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM]));
+ if (class_max == 0)
+ return -EINVAL;
+ if (class_max > NF_CT_MAX_EXPECT_CLASSES)
return -EOVERFLOW;
expect_policy = kzalloc(sizeof(struct nf_conntrack_expect_policy) *
- helper->expect_class_max, GFP_KERNEL);
+ class_max, GFP_KERNEL);
if (expect_policy == NULL)
return -ENOMEM;
- for (i=0; i<helper->expect_class_max; i++) {
+ for (i = 0; i < class_max; i++) {
if (!tb[NFCTH_POLICY_SET+i])
goto err;
@@ -191,6 +198,8 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
if (ret < 0)
goto err;
}
+
+ helper->expect_class_max = class_max - 1;
helper->expect_policy = expect_policy;
return 0;
err:
@@ -203,18 +212,20 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
struct nf_conntrack_tuple *tuple)
{
struct nf_conntrack_helper *helper;
+ struct nfnl_cthelper *nfcth;
int ret;
if (!tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] || !tb[NFCTH_PRIV_DATA_LEN])
return -EINVAL;
- helper = kzalloc(sizeof(struct nf_conntrack_helper), GFP_KERNEL);
- if (helper == NULL)
+ nfcth = kzalloc(sizeof(*nfcth), GFP_KERNEL);
+ if (nfcth == NULL)
return -ENOMEM;
+ helper = &nfcth->helper;
ret = nfnl_cthelper_parse_expect_policy(helper, tb[NFCTH_POLICY]);
if (ret < 0)
- goto err;
+ goto err1;
strncpy(helper->name, nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN);
helper->data_len = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
@@ -245,15 +256,101 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
ret = nf_conntrack_helper_register(helper);
if (ret < 0)
- goto err;
+ goto err2;
+ list_add_tail(&nfcth->list, &nfnl_cthelper_list);
return 0;
-err:
- kfree(helper);
+err2:
+ kfree(helper->expect_policy);
+err1:
+ kfree(nfcth);
return ret;
}
static int
+nfnl_cthelper_update_policy_one(const struct nf_conntrack_expect_policy *policy,
+ struct nf_conntrack_expect_policy *new_policy,
+ const struct nlattr *attr)
+{
+ struct nlattr *tb[NFCTH_POLICY_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFCTH_POLICY_MAX, attr,
+ nfnl_cthelper_expect_pol);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFCTH_POLICY_NAME] ||
+ !tb[NFCTH_POLICY_EXPECT_MAX] ||
+ !tb[NFCTH_POLICY_EXPECT_TIMEOUT])
+ return -EINVAL;
+
+ if (nla_strcmp(tb[NFCTH_POLICY_NAME], policy->name))
+ return -EBUSY;
+
+ new_policy->max_expected =
+ ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
+ new_policy->timeout =
+ ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT]));
+
+ return 0;
+}
+
+static int nfnl_cthelper_update_policy_all(struct nlattr *tb[],
+ struct nf_conntrack_helper *helper)
+{
+ struct nf_conntrack_expect_policy new_policy[helper->expect_class_max + 1];
+ struct nf_conntrack_expect_policy *policy;
+ int i, err;
+
+ /* Check first that all policy attributes are well-formed, so we don't
+ * leave things in inconsistent state on errors.
+ */
+ for (i = 0; i < helper->expect_class_max + 1; i++) {
+
+ if (!tb[NFCTH_POLICY_SET + i])
+ return -EINVAL;
+
+ err = nfnl_cthelper_update_policy_one(&helper->expect_policy[i],
+ &new_policy[i],
+ tb[NFCTH_POLICY_SET + i]);
+ if (err < 0)
+ return err;
+ }
+ /* Now we can safely update them. */
+ for (i = 0; i < helper->expect_class_max + 1; i++) {
+ policy = (struct nf_conntrack_expect_policy *)
+ &helper->expect_policy[i];
+ policy->max_expected = new_policy->max_expected;
+ policy->timeout = new_policy->timeout;
+ }
+
+ return 0;
+}
+
+static int nfnl_cthelper_update_policy(struct nf_conntrack_helper *helper,
+ const struct nlattr *attr)
+{
+ struct nlattr *tb[NFCTH_POLICY_SET_MAX + 1];
+ unsigned int class_max;
+ int err;
+
+ err = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
+ nfnl_cthelper_expect_policy_set);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFCTH_POLICY_SET_NUM])
+ return -EINVAL;
+
+ class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM]));
+ if (helper->expect_class_max + 1 != class_max)
+ return -EBUSY;
+
+ return nfnl_cthelper_update_policy_all(tb, helper);
+}
+
+static int
nfnl_cthelper_update(const struct nlattr * const tb[],
struct nf_conntrack_helper *helper)
{
@@ -263,8 +360,7 @@ nfnl_cthelper_update(const struct nlattr * const tb[],
return -EBUSY;
if (tb[NFCTH_POLICY]) {
- ret = nfnl_cthelper_parse_expect_policy(helper,
- tb[NFCTH_POLICY]);
+ ret = nfnl_cthelper_update_policy(helper, tb[NFCTH_POLICY]);
if (ret < 0)
return ret;
}
@@ -293,7 +389,8 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
const char *helper_name;
struct nf_conntrack_helper *cur, *helper = NULL;
struct nf_conntrack_tuple tuple;
- int ret = 0, i;
+ struct nfnl_cthelper *nlcth;
+ int ret = 0;
if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE])
return -EINVAL;
@@ -304,31 +401,22 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
if (ret < 0)
return ret;
- rcu_read_lock();
- for (i = 0; i < nf_ct_helper_hsize && !helper; i++) {
- hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) {
+ list_for_each_entry(nlcth, &nfnl_cthelper_list, list) {
+ cur = &nlcth->helper;
- /* skip non-userspace conntrack helpers. */
- if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
- continue;
+ if (strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN))
+ continue;
- if (strncmp(cur->name, helper_name,
- NF_CT_HELPER_NAME_LEN) != 0)
- continue;
+ if ((tuple.src.l3num != cur->tuple.src.l3num ||
+ tuple.dst.protonum != cur->tuple.dst.protonum))
+ continue;
- if ((tuple.src.l3num != cur->tuple.src.l3num ||
- tuple.dst.protonum != cur->tuple.dst.protonum))
- continue;
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
- ret = -EEXIST;
- goto err;
- }
- helper = cur;
- break;
- }
+ helper = cur;
+ break;
}
- rcu_read_unlock();
if (helper == NULL)
ret = nfnl_cthelper_create(tb, &tuple);
@@ -336,9 +424,6 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
ret = nfnl_cthelper_update(tb, helper);
return ret;
-err:
- rcu_read_unlock();
- return ret;
}
static int
@@ -377,10 +462,10 @@ nfnl_cthelper_dump_policy(struct sk_buff *skb,
goto nla_put_failure;
if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM,
- htonl(helper->expect_class_max)))
+ htonl(helper->expect_class_max + 1)))
goto nla_put_failure;
- for (i=0; i<helper->expect_class_max; i++) {
+ for (i = 0; i < helper->expect_class_max + 1; i++) {
nest_parms2 = nla_nest_start(skb,
(NFCTH_POLICY_SET+i) | NLA_F_NESTED);
if (nest_parms2 == NULL)
@@ -502,11 +587,12 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const tb[])
{
- int ret = -ENOENT, i;
+ int ret = -ENOENT;
struct nf_conntrack_helper *cur;
struct sk_buff *skb2;
char *helper_name = NULL;
struct nf_conntrack_tuple tuple;
+ struct nfnl_cthelper *nlcth;
bool tuple_set = false;
if (nlh->nlmsg_flags & NLM_F_DUMP) {
@@ -527,45 +613,39 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
tuple_set = true;
}
- for (i = 0; i < nf_ct_helper_hsize; i++) {
- hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) {
+ list_for_each_entry(nlcth, &nfnl_cthelper_list, list) {
+ cur = &nlcth->helper;
+ if (helper_name &&
+ strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN))
+ continue;
- /* skip non-userspace conntrack helpers. */
- if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
- continue;
+ if (tuple_set &&
+ (tuple.src.l3num != cur->tuple.src.l3num ||
+ tuple.dst.protonum != cur->tuple.dst.protonum))
+ continue;
- if (helper_name && strncmp(cur->name, helper_name,
- NF_CT_HELPER_NAME_LEN) != 0) {
- continue;
- }
- if (tuple_set &&
- (tuple.src.l3num != cur->tuple.src.l3num ||
- tuple.dst.protonum != cur->tuple.dst.protonum))
- continue;
-
- skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL) {
- ret = -ENOMEM;
- break;
- }
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL) {
+ ret = -ENOMEM;
+ break;
+ }
- ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
- NFNL_MSG_CTHELPER_NEW, cur);
- if (ret <= 0) {
- kfree_skb(skb2);
- break;
- }
+ ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(nlh->nlmsg_type),
+ NFNL_MSG_CTHELPER_NEW, cur);
+ if (ret <= 0) {
+ kfree_skb(skb2);
+ break;
+ }
- ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
- MSG_DONTWAIT);
- if (ret > 0)
- ret = 0;
+ ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
+ MSG_DONTWAIT);
+ if (ret > 0)
+ ret = 0;
- /* this avoids a loop in nfnetlink. */
- return ret == -EAGAIN ? -ENOBUFS : ret;
- }
+ /* this avoids a loop in nfnetlink. */
+ return ret == -EAGAIN ? -ENOBUFS : ret;
}
return ret;
}
@@ -576,10 +656,10 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
{
char *helper_name = NULL;
struct nf_conntrack_helper *cur;
- struct hlist_node *tmp;
struct nf_conntrack_tuple tuple;
bool tuple_set = false, found = false;
- int i, j = 0, ret;
+ struct nfnl_cthelper *nlcth, *n;
+ int j = 0, ret;
if (tb[NFCTH_NAME])
helper_name = nla_data(tb[NFCTH_NAME]);
@@ -592,28 +672,27 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
tuple_set = true;
}
- for (i = 0; i < nf_ct_helper_hsize; i++) {
- hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i],
- hnode) {
- /* skip non-userspace conntrack helpers. */
- if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
- continue;
+ list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) {
+ cur = &nlcth->helper;
+ j++;
- j++;
+ if (helper_name &&
+ strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN))
+ continue;
- if (helper_name && strncmp(cur->name, helper_name,
- NF_CT_HELPER_NAME_LEN) != 0) {
- continue;
- }
- if (tuple_set &&
- (tuple.src.l3num != cur->tuple.src.l3num ||
- tuple.dst.protonum != cur->tuple.dst.protonum))
- continue;
+ if (tuple_set &&
+ (tuple.src.l3num != cur->tuple.src.l3num ||
+ tuple.dst.protonum != cur->tuple.dst.protonum))
+ continue;
- found = true;
- nf_conntrack_helper_unregister(cur);
- }
+ found = true;
+ nf_conntrack_helper_unregister(cur);
+ kfree(cur->expect_policy);
+
+ list_del(&nlcth->list);
+ kfree(nlcth);
}
+
/* Make sure we return success if we flush and there is no helpers */
return (found || j == 0) ? 0 : -ENOENT;
}
@@ -662,20 +741,16 @@ err_out:
static void __exit nfnl_cthelper_exit(void)
{
struct nf_conntrack_helper *cur;
- struct hlist_node *tmp;
- int i;
+ struct nfnl_cthelper *nlcth, *n;
nfnetlink_subsys_unregister(&nfnl_cthelper_subsys);
- for (i=0; i<nf_ct_helper_hsize; i++) {
- hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i],
- hnode) {
- /* skip non-userspace conntrack helpers. */
- if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
- continue;
+ list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) {
+ cur = &nlcth->helper;
- nf_conntrack_helper_unregister(cur);
- }
+ nf_conntrack_helper_unregister(cur);
+ kfree(cur->expect_policy);
+ kfree(nlcth);
}
}
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 139e0867e56e..47d6656c9119 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -646,8 +646,8 @@ static void __exit cttimeout_exit(void)
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
+ synchronize_rcu();
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
- rcu_barrier();
}
module_init(cttimeout_init);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index eb086a192c5a..08247bf7d7b8 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -80,7 +80,7 @@ struct nfulnl_instance {
#define INSTANCE_BUCKETS 16
-static int nfnl_log_net_id __read_mostly;
+static unsigned int nfnl_log_net_id __read_mostly;
struct nfnl_log_net {
spinlock_t instances_lock;
@@ -330,7 +330,7 @@ nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size,
* message. WARNING: has to be <= 128k due to slab restrictions */
n = max(inst_size, pkt_size);
- skb = alloc_skb(n, GFP_ATOMIC);
+ skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN);
if (!skb) {
if (n > pkt_size) {
/* try to allocate only as much as we need for current
@@ -538,7 +538,7 @@ __build_packet_message(struct nfnl_log_net *log,
goto nla_put_failure;
}
- if (skb->tstamp.tv64) {
+ if (skb->tstamp) {
struct nfulnl_msg_packet_timestamp ts;
struct timespec64 kts = ktime_to_timespec64(skb->tstamp);
ts.sec = cpu_to_be64(kts.tv_sec);
@@ -1152,6 +1152,7 @@ MODULE_ALIAS_NF_LOGGER(AF_INET, 1);
MODULE_ALIAS_NF_LOGGER(AF_INET6, 1);
MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 1);
MODULE_ALIAS_NF_LOGGER(3, 1); /* NFPROTO_ARP */
+MODULE_ALIAS_NF_LOGGER(5, 1); /* NFPROTO_NETDEV */
module_init(nfnetlink_log_init);
module_exit(nfnetlink_log_fini);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index af832c526048..933509ebf3d3 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -69,7 +69,7 @@ struct nfqnl_instance {
* Following fields are dirtied for each queued packet,
* keep them in same cache line if possible.
*/
- spinlock_t lock;
+ spinlock_t lock ____cacheline_aligned_in_smp;
unsigned int queue_total;
unsigned int id_sequence; /* 'sequence' of pkt ids */
struct list_head queue_list; /* packets in queue */
@@ -77,7 +77,7 @@ struct nfqnl_instance {
typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
-static int nfnl_queue_net_id __read_mostly;
+static unsigned int nfnl_queue_net_id __read_mostly;
#define INSTANCE_BUCKETS 16
struct nfnl_queue_net {
@@ -384,7 +384,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
+ nla_total_size(sizeof(u_int32_t)) /* skbinfo */
+ nla_total_size(sizeof(u_int32_t)); /* cap_len */
- if (entskb->tstamp.tv64)
+ if (entskb->tstamp)
size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
size += nfqnl_get_bridge_size(entry);
@@ -443,7 +443,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
skb = alloc_skb(size, GFP_ATOMIC);
if (!skb) {
skb_tx_error(entskb);
- return NULL;
+ goto nlmsg_failure;
}
nlh = nlmsg_put(skb, 0, 0,
@@ -452,7 +452,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (!nlh) {
skb_tx_error(entskb);
kfree_skb(skb);
- return NULL;
+ goto nlmsg_failure;
}
nfmsg = nlmsg_data(nlh);
nfmsg->nfgen_family = entry->state.pf;
@@ -555,7 +555,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (nfqnl_put_bridge(entry, skb) < 0)
goto nla_put_failure;
- if (entskb->tstamp.tv64) {
+ if (entskb->tstamp) {
struct nfqnl_msg_packet_timestamp ts;
struct timespec64 kts = ktime_to_timespec64(entskb->tstamp);
@@ -598,12 +598,17 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
}
nlh->nlmsg_len = skb->len;
+ if (seclen)
+ security_release_secctx(secdata, seclen);
return skb;
nla_put_failure:
skb_tx_error(entskb);
kfree_skb(skb);
net_err_ratelimited("nf_queue: error creating packet message\n");
+nlmsg_failure:
+ if (seclen)
+ security_release_secctx(secdata, seclen);
return NULL;
}
@@ -919,7 +924,7 @@ static struct notifier_block nfqnl_dev_notifier = {
static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long entry_ptr)
{
- return rcu_access_pointer(entry->state.hook_entries) ==
+ return rcu_access_pointer(entry->hook) ==
(struct nf_hook_entry *)entry_ptr;
}
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 31c15ed2e5fc..877d9acd91ef 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -121,7 +121,6 @@ nla_put_failure:
return -1;
}
-static struct nft_expr_type nft_bitwise_type;
static const struct nft_expr_ops nft_bitwise_ops = {
.type = &nft_bitwise_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise)),
@@ -130,20 +129,10 @@ static const struct nft_expr_ops nft_bitwise_ops = {
.dump = nft_bitwise_dump,
};
-static struct nft_expr_type nft_bitwise_type __read_mostly = {
+struct nft_expr_type nft_bitwise_type __read_mostly = {
.name = "bitwise",
.ops = &nft_bitwise_ops,
.policy = nft_bitwise_policy,
.maxattr = NFTA_BITWISE_MAX,
.owner = THIS_MODULE,
};
-
-int __init nft_bitwise_module_init(void)
-{
- return nft_register_expr(&nft_bitwise_type);
-}
-
-void nft_bitwise_module_exit(void)
-{
- nft_unregister_expr(&nft_bitwise_type);
-}
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index ee63d981268d..13d4e421a6b3 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -169,7 +169,6 @@ nla_put_failure:
return -1;
}
-static struct nft_expr_type nft_byteorder_type;
static const struct nft_expr_ops nft_byteorder_ops = {
.type = &nft_byteorder_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)),
@@ -178,20 +177,10 @@ static const struct nft_expr_ops nft_byteorder_ops = {
.dump = nft_byteorder_dump,
};
-static struct nft_expr_type nft_byteorder_type __read_mostly = {
+struct nft_expr_type nft_byteorder_type __read_mostly = {
.name = "byteorder",
.ops = &nft_byteorder_ops,
.policy = nft_byteorder_policy,
.maxattr = NFTA_BYTEORDER_MAX,
.owner = THIS_MODULE,
};
-
-int __init nft_byteorder_module_init(void)
-{
- return nft_register_expr(&nft_byteorder_type);
-}
-
-void nft_byteorder_module_exit(void)
-{
- nft_unregister_expr(&nft_byteorder_type);
-}
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 2e53739812b1..2b96effeadc1 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -84,9 +84,6 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
if (err < 0)
return err;
- if (desc.len > U8_MAX)
- return -ERANGE;
-
priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP]));
priv->len = desc.len;
return 0;
@@ -110,7 +107,6 @@ nla_put_failure:
return -1;
}
-static struct nft_expr_type nft_cmp_type;
static const struct nft_expr_ops nft_cmp_ops = {
.type = &nft_cmp_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)),
@@ -211,20 +207,10 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
return &nft_cmp_ops;
}
-static struct nft_expr_type nft_cmp_type __read_mostly = {
+struct nft_expr_type nft_cmp_type __read_mostly = {
.name = "cmp",
.select_ops = nft_cmp_select_ops,
.policy = nft_cmp_policy,
.maxattr = NFTA_CMP_MAX,
.owner = THIS_MODULE,
};
-
-int __init nft_cmp_module_init(void)
-{
- return nft_register_expr(&nft_cmp_type);
-}
-
-void nft_cmp_module_exit(void)
-{
- nft_unregister_expr(&nft_cmp_type);
-}
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index 77db8358ab14..7f8422213341 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -18,105 +18,197 @@
#include <net/netfilter/nf_tables.h>
struct nft_counter {
- u64 bytes;
- u64 packets;
-};
-
-struct nft_counter_percpu {
- struct nft_counter counter;
- struct u64_stats_sync syncp;
+ s64 bytes;
+ s64 packets;
};
struct nft_counter_percpu_priv {
- struct nft_counter_percpu __percpu *counter;
+ struct nft_counter __percpu *counter;
};
-static void nft_counter_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static DEFINE_PER_CPU(seqcount_t, nft_counter_seq);
+
+static inline void nft_counter_do_eval(struct nft_counter_percpu_priv *priv,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
- struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
- struct nft_counter_percpu *this_cpu;
+ struct nft_counter *this_cpu;
+ seqcount_t *myseq;
local_bh_disable();
this_cpu = this_cpu_ptr(priv->counter);
- u64_stats_update_begin(&this_cpu->syncp);
- this_cpu->counter.bytes += pkt->skb->len;
- this_cpu->counter.packets++;
- u64_stats_update_end(&this_cpu->syncp);
+ myseq = this_cpu_ptr(&nft_counter_seq);
+
+ write_seqcount_begin(myseq);
+
+ this_cpu->bytes += pkt->skb->len;
+ this_cpu->packets++;
+
+ write_seqcount_end(myseq);
local_bh_enable();
}
-static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter,
+static inline void nft_counter_obj_eval(struct nft_object *obj,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
+
+ nft_counter_do_eval(priv, regs, pkt);
+}
+
+static int nft_counter_do_init(const struct nlattr * const tb[],
+ struct nft_counter_percpu_priv *priv)
+{
+ struct nft_counter __percpu *cpu_stats;
+ struct nft_counter *this_cpu;
+
+ cpu_stats = alloc_percpu(struct nft_counter);
+ if (cpu_stats == NULL)
+ return -ENOMEM;
+
+ preempt_disable();
+ this_cpu = this_cpu_ptr(cpu_stats);
+ if (tb[NFTA_COUNTER_PACKETS]) {
+ this_cpu->packets =
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+ }
+ if (tb[NFTA_COUNTER_BYTES]) {
+ this_cpu->bytes =
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+ }
+ preempt_enable();
+ priv->counter = cpu_stats;
+ return 0;
+}
+
+static int nft_counter_obj_init(const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+ struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
+
+ return nft_counter_do_init(tb, priv);
+}
+
+static void nft_counter_do_destroy(struct nft_counter_percpu_priv *priv)
+{
+ free_percpu(priv->counter);
+}
+
+static void nft_counter_obj_destroy(struct nft_object *obj)
+{
+ struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
+
+ nft_counter_do_destroy(priv);
+}
+
+static void nft_counter_reset(struct nft_counter_percpu_priv __percpu *priv,
struct nft_counter *total)
{
- const struct nft_counter_percpu *cpu_stats;
+ struct nft_counter *this_cpu;
+
+ local_bh_disable();
+ this_cpu = this_cpu_ptr(priv->counter);
+ this_cpu->packets -= total->packets;
+ this_cpu->bytes -= total->bytes;
+ local_bh_enable();
+}
+
+static void nft_counter_fetch(struct nft_counter_percpu_priv *priv,
+ struct nft_counter *total)
+{
+ struct nft_counter *this_cpu;
+ const seqcount_t *myseq;
u64 bytes, packets;
unsigned int seq;
int cpu;
memset(total, 0, sizeof(*total));
for_each_possible_cpu(cpu) {
- cpu_stats = per_cpu_ptr(counter, cpu);
+ myseq = per_cpu_ptr(&nft_counter_seq, cpu);
+ this_cpu = per_cpu_ptr(priv->counter, cpu);
do {
- seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
- bytes = cpu_stats->counter.bytes;
- packets = cpu_stats->counter.packets;
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq));
+ seq = read_seqcount_begin(myseq);
+ bytes = this_cpu->bytes;
+ packets = this_cpu->packets;
+ } while (read_seqcount_retry(myseq, seq));
- total->packets += packets;
- total->bytes += bytes;
+ total->bytes += bytes;
+ total->packets += packets;
}
}
-static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_counter_do_dump(struct sk_buff *skb,
+ struct nft_counter_percpu_priv *priv,
+ bool reset)
{
- struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
struct nft_counter total;
- nft_counter_fetch(priv->counter, &total);
+ nft_counter_fetch(priv, &total);
if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes),
NFTA_COUNTER_PAD) ||
nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets),
NFTA_COUNTER_PAD))
goto nla_put_failure;
+
+ if (reset)
+ nft_counter_reset(priv, &total);
+
return 0;
nla_put_failure:
return -1;
}
+static int nft_counter_obj_dump(struct sk_buff *skb,
+ struct nft_object *obj, bool reset)
+{
+ struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
+
+ return nft_counter_do_dump(skb, priv, reset);
+}
+
static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
[NFTA_COUNTER_PACKETS] = { .type = NLA_U64 },
[NFTA_COUNTER_BYTES] = { .type = NLA_U64 },
};
+static struct nft_object_type nft_counter_obj __read_mostly = {
+ .type = NFT_OBJECT_COUNTER,
+ .size = sizeof(struct nft_counter_percpu_priv),
+ .maxattr = NFTA_COUNTER_MAX,
+ .policy = nft_counter_policy,
+ .eval = nft_counter_obj_eval,
+ .init = nft_counter_obj_init,
+ .destroy = nft_counter_obj_destroy,
+ .dump = nft_counter_obj_dump,
+ .owner = THIS_MODULE,
+};
+
+static void nft_counter_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+
+ nft_counter_do_eval(priv, regs, pkt);
+}
+
+static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+
+ return nft_counter_do_dump(skb, priv, false);
+}
+
static int nft_counter_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
- struct nft_counter_percpu __percpu *cpu_stats;
- struct nft_counter_percpu *this_cpu;
-
- cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu);
- if (cpu_stats == NULL)
- return -ENOMEM;
- preempt_disable();
- this_cpu = this_cpu_ptr(cpu_stats);
- if (tb[NFTA_COUNTER_PACKETS]) {
- this_cpu->counter.packets =
- be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
- }
- if (tb[NFTA_COUNTER_BYTES]) {
- this_cpu->counter.bytes =
- be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
- }
- preempt_enable();
- priv->counter = cpu_stats;
- return 0;
+ return nft_counter_do_init(tb, priv);
}
static void nft_counter_destroy(const struct nft_ctx *ctx,
@@ -124,28 +216,27 @@ static void nft_counter_destroy(const struct nft_ctx *ctx,
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
- free_percpu(priv->counter);
+ nft_counter_do_destroy(priv);
}
static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(src);
struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst);
- struct nft_counter_percpu __percpu *cpu_stats;
- struct nft_counter_percpu *this_cpu;
+ struct nft_counter __percpu *cpu_stats;
+ struct nft_counter *this_cpu;
struct nft_counter total;
- nft_counter_fetch(priv->counter, &total);
+ nft_counter_fetch(priv, &total);
- cpu_stats = __netdev_alloc_pcpu_stats(struct nft_counter_percpu,
- GFP_ATOMIC);
+ cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC);
if (cpu_stats == NULL)
return -ENOMEM;
preempt_disable();
this_cpu = this_cpu_ptr(cpu_stats);
- this_cpu->counter.packets = total.packets;
- this_cpu->counter.bytes = total.bytes;
+ this_cpu->packets = total.packets;
+ this_cpu->bytes = total.bytes;
preempt_enable();
priv_clone->counter = cpu_stats;
@@ -174,12 +265,29 @@ static struct nft_expr_type nft_counter_type __read_mostly = {
static int __init nft_counter_module_init(void)
{
- return nft_register_expr(&nft_counter_type);
+ int cpu, err;
+
+ for_each_possible_cpu(cpu)
+ seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu));
+
+ err = nft_register_obj(&nft_counter_obj);
+ if (err < 0)
+ return err;
+
+ err = nft_register_expr(&nft_counter_type);
+ if (err < 0)
+ goto err1;
+
+ return 0;
+err1:
+ nft_unregister_obj(&nft_counter_obj);
+ return err;
}
static void __exit nft_counter_module_exit(void)
{
nft_unregister_expr(&nft_counter_type);
+ nft_unregister_obj(&nft_counter_obj);
}
module_init(nft_counter_module_init);
@@ -188,3 +296,4 @@ module_exit(nft_counter_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_EXPR("counter");
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_COUNTER);
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index d7b0d171172a..0264258c46fe 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -31,6 +32,11 @@ struct nft_ct {
};
};
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template);
+static unsigned int nft_ct_pcpu_template_refcnt __read_mostly;
+#endif
+
static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c,
enum nft_ct_keys k,
enum ip_conntrack_dir d)
@@ -77,7 +83,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
switch (priv->key) {
case NFT_CT_DIRECTION:
- *dest = CTINFO2DIR(ctinfo);
+ nft_reg_store8(dest, CTINFO2DIR(ctinfo));
return;
case NFT_CT_STATUS:
*dest = ct->status;
@@ -128,12 +134,42 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
memcpy(dest, &count, sizeof(count));
return;
}
+ case NFT_CT_AVGPKT: {
+ const struct nf_conn_acct *acct = nf_conn_acct_find(ct);
+ u64 avgcnt = 0, bcnt = 0, pcnt = 0;
+
+ if (acct) {
+ pcnt = nft_ct_get_eval_counter(acct->counter,
+ NFT_CT_PKTS, priv->dir);
+ bcnt = nft_ct_get_eval_counter(acct->counter,
+ NFT_CT_BYTES, priv->dir);
+ if (pcnt != 0)
+ avgcnt = div64_u64(bcnt, pcnt);
+ }
+
+ memcpy(dest, &avgcnt, sizeof(avgcnt));
+ return;
+ }
case NFT_CT_L3PROTOCOL:
- *dest = nf_ct_l3num(ct);
+ nft_reg_store8(dest, nf_ct_l3num(ct));
return;
case NFT_CT_PROTOCOL:
- *dest = nf_ct_protonum(ct);
+ nft_reg_store8(dest, nf_ct_protonum(ct));
+ return;
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ case NFT_CT_ZONE: {
+ const struct nf_conntrack_zone *zone = nf_ct_zone(ct);
+ u16 zoneid;
+
+ if (priv->dir < IP_CT_DIR_MAX)
+ zoneid = nf_ct_zone_id(zone, priv->dir);
+ else
+ zoneid = zone->id;
+
+ nft_reg_store16(dest, zoneid);
return;
+ }
+#endif
default:
break;
}
@@ -149,10 +185,10 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
return;
case NFT_CT_PROTO_SRC:
- *dest = (__force __u16)tuple->src.u.all;
+ nft_reg_store16(dest, (__force u16)tuple->src.u.all);
return;
case NFT_CT_PROTO_DST:
- *dest = (__force __u16)tuple->dst.u.all;
+ nft_reg_store16(dest, (__force u16)tuple->dst.u.all);
return;
default:
break;
@@ -162,6 +198,53 @@ err:
regs->verdict.code = NFT_BREAK;
}
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static void nft_ct_set_zone_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nf_conntrack_zone zone = { .dir = NF_CT_DEFAULT_ZONE_DIR };
+ const struct nft_ct *priv = nft_expr_priv(expr);
+ struct sk_buff *skb = pkt->skb;
+ enum ip_conntrack_info ctinfo;
+ u16 value = nft_reg_load16(&regs->data[priv->sreg]);
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) /* already tracked */
+ return;
+
+ zone.id = value;
+
+ switch (priv->dir) {
+ case IP_CT_DIR_ORIGINAL:
+ zone.dir = NF_CT_ZONE_DIR_ORIG;
+ break;
+ case IP_CT_DIR_REPLY:
+ zone.dir = NF_CT_ZONE_DIR_REPL;
+ break;
+ default:
+ break;
+ }
+
+ ct = this_cpu_read(nft_ct_pcpu_template);
+
+ if (likely(atomic_read(&ct->ct_general.use) == 1)) {
+ nf_ct_zone_add(ct, &zone);
+ } else {
+ /* previous skb got queued to userspace */
+ ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC);
+ if (!ct) {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
+ }
+
+ atomic_inc(&ct->ct_general.use);
+ nf_ct_set(skb, ct, IP_CT_NEW);
+}
+#endif
+
static void nft_ct_set_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -207,39 +290,78 @@ static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = {
[NFTA_CT_SREG] = { .type = NLA_U32 },
};
-static int nft_ct_l3proto_try_module_get(uint8_t family)
+static int nft_ct_netns_get(struct net *net, uint8_t family)
{
int err;
if (family == NFPROTO_INET) {
- err = nf_ct_l3proto_try_module_get(NFPROTO_IPV4);
+ err = nf_ct_netns_get(net, NFPROTO_IPV4);
if (err < 0)
goto err1;
- err = nf_ct_l3proto_try_module_get(NFPROTO_IPV6);
+ err = nf_ct_netns_get(net, NFPROTO_IPV6);
if (err < 0)
goto err2;
} else {
- err = nf_ct_l3proto_try_module_get(family);
+ err = nf_ct_netns_get(net, family);
if (err < 0)
goto err1;
}
return 0;
err2:
- nf_ct_l3proto_module_put(NFPROTO_IPV4);
+ nf_ct_netns_put(net, NFPROTO_IPV4);
err1:
return err;
}
-static void nft_ct_l3proto_module_put(uint8_t family)
+static void nft_ct_netns_put(struct net *net, uint8_t family)
{
if (family == NFPROTO_INET) {
- nf_ct_l3proto_module_put(NFPROTO_IPV4);
- nf_ct_l3proto_module_put(NFPROTO_IPV6);
+ nf_ct_netns_put(net, NFPROTO_IPV4);
+ nf_ct_netns_put(net, NFPROTO_IPV6);
} else
- nf_ct_l3proto_module_put(family);
+ nf_ct_netns_put(net, family);
+}
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static void nft_ct_tmpl_put_pcpu(void)
+{
+ struct nf_conn *ct;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ ct = per_cpu(nft_ct_pcpu_template, cpu);
+ if (!ct)
+ break;
+ nf_ct_put(ct);
+ per_cpu(nft_ct_pcpu_template, cpu) = NULL;
+ }
}
+static bool nft_ct_tmpl_alloc_pcpu(void)
+{
+ struct nf_conntrack_zone zone = { .id = 0 };
+ struct nf_conn *tmp;
+ int cpu;
+
+ if (nft_ct_pcpu_template_refcnt)
+ return true;
+
+ for_each_possible_cpu(cpu) {
+ tmp = nf_ct_tmpl_alloc(&init_net, &zone, GFP_KERNEL);
+ if (!tmp) {
+ nft_ct_tmpl_put_pcpu();
+ return false;
+ }
+
+ atomic_set(&tmp->ct_general.use, 1);
+ per_cpu(nft_ct_pcpu_template, cpu) = tmp;
+ }
+
+ return true;
+}
+#endif
+
static int nft_ct_get_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -249,6 +371,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
int err;
priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
+ priv->dir = IP_CT_DIR_MAX;
switch (priv->key) {
case NFT_CT_DIRECTION:
if (tb[NFTA_CT_DIRECTION] != NULL)
@@ -315,11 +438,14 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
break;
case NFT_CT_BYTES:
case NFT_CT_PKTS:
- /* no direction? return sum of original + reply */
- if (tb[NFTA_CT_DIRECTION] == NULL)
- priv->dir = IP_CT_DIR_MAX;
+ case NFT_CT_AVGPKT:
len = sizeof(u64);
break;
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ case NFT_CT_ZONE:
+ len = sizeof(u16);
+ break;
+#endif
default:
return -EOPNOTSUPP;
}
@@ -341,25 +467,45 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
- err = nft_ct_l3proto_try_module_get(ctx->afi->family);
+ err = nft_ct_netns_get(ctx->net, ctx->afi->family);
if (err < 0)
return err;
- if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS)
+ if (priv->key == NFT_CT_BYTES ||
+ priv->key == NFT_CT_PKTS ||
+ priv->key == NFT_CT_AVGPKT)
nf_ct_set_acct(ctx->net, true);
return 0;
}
+static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv)
+{
+ switch (priv->key) {
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ case NFT_CT_LABELS:
+ nf_connlabels_put(ctx->net);
+ break;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ case NFT_CT_ZONE:
+ if (--nft_ct_pcpu_template_refcnt == 0)
+ nft_ct_tmpl_put_pcpu();
+#endif
+ default:
+ break;
+ }
+}
+
static int nft_ct_set_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_ct *priv = nft_expr_priv(expr);
- bool label_got = false;
unsigned int len;
int err;
+ priv->dir = IP_CT_DIR_MAX;
priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
switch (priv->key) {
#ifdef CONFIG_NF_CONNTRACK_MARK
@@ -377,34 +523,52 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1);
if (err)
return err;
- label_got = true;
+ break;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ case NFT_CT_ZONE:
+ if (!nft_ct_tmpl_alloc_pcpu())
+ return -ENOMEM;
+ nft_ct_pcpu_template_refcnt++;
+ len = sizeof(u16);
break;
#endif
default:
return -EOPNOTSUPP;
}
+ if (tb[NFTA_CT_DIRECTION]) {
+ priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]);
+ switch (priv->dir) {
+ case IP_CT_DIR_ORIGINAL:
+ case IP_CT_DIR_REPLY:
+ break;
+ default:
+ err = -EINVAL;
+ goto err1;
+ }
+ }
+
priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]);
err = nft_validate_register_load(priv->sreg, len);
if (err < 0)
goto err1;
- err = nft_ct_l3proto_try_module_get(ctx->afi->family);
+ err = nft_ct_netns_get(ctx->net, ctx->afi->family);
if (err < 0)
goto err1;
return 0;
err1:
- if (label_got)
- nf_connlabels_put(ctx->net);
+ __nft_ct_set_destroy(ctx, priv);
return err;
}
static void nft_ct_get_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
- nft_ct_l3proto_module_put(ctx->afi->family);
+ nf_ct_netns_put(ctx->net, ctx->afi->family);
}
static void nft_ct_set_destroy(const struct nft_ctx *ctx,
@@ -412,17 +576,8 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx,
{
struct nft_ct *priv = nft_expr_priv(expr);
- switch (priv->key) {
-#ifdef CONFIG_NF_CONNTRACK_LABELS
- case NFT_CT_LABELS:
- nf_connlabels_put(ctx->net);
- break;
-#endif
- default:
- break;
- }
-
- nft_ct_l3proto_module_put(ctx->afi->family);
+ __nft_ct_set_destroy(ctx, priv);
+ nft_ct_netns_put(ctx->net, ctx->afi->family);
}
static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -444,6 +599,8 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
break;
case NFT_CT_BYTES:
case NFT_CT_PKTS:
+ case NFT_CT_AVGPKT:
+ case NFT_CT_ZONE:
if (priv->dir < IP_CT_DIR_MAX &&
nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
goto nla_put_failure;
@@ -466,6 +623,17 @@ static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key)))
goto nla_put_failure;
+
+ switch (priv->key) {
+ case NFT_CT_ZONE:
+ if (priv->dir < IP_CT_DIR_MAX &&
+ nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
+ goto nla_put_failure;
+ break;
+ default:
+ break;
+ }
+
return 0;
nla_put_failure:
@@ -491,6 +659,17 @@ static const struct nft_expr_ops nft_ct_set_ops = {
.dump = nft_ct_set_dump,
};
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static const struct nft_expr_ops nft_ct_set_zone_ops = {
+ .type = &nft_ct_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
+ .eval = nft_ct_set_zone_eval,
+ .init = nft_ct_set_init,
+ .destroy = nft_ct_set_destroy,
+ .dump = nft_ct_set_dump,
+};
+#endif
+
static const struct nft_expr_ops *
nft_ct_select_ops(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
@@ -504,8 +683,13 @@ nft_ct_select_ops(const struct nft_ctx *ctx,
if (tb[NFTA_CT_DREG])
return &nft_ct_get_ops;
- if (tb[NFTA_CT_SREG])
+ if (tb[NFTA_CT_SREG]) {
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ if (nla_get_be32(tb[NFTA_CT_KEY]) == htonl(NFT_CT_ZONE))
+ return &nft_ct_set_zone_ops;
+#endif
return &nft_ct_set_ops;
+ }
return ERR_PTR(-EINVAL);
}
@@ -518,15 +702,60 @@ static struct nft_expr_type nft_ct_type __read_mostly = {
.owner = THIS_MODULE,
};
+static void nft_notrack_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct sk_buff *skb = pkt->skb;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(pkt->skb, &ctinfo);
+ /* Previously seen (loopback or untracked)? Ignore. */
+ if (ct)
+ return;
+
+ ct = nf_ct_untracked_get();
+ atomic_inc(&ct->ct_general.use);
+ nf_ct_set(skb, ct, IP_CT_NEW);
+}
+
+static struct nft_expr_type nft_notrack_type;
+static const struct nft_expr_ops nft_notrack_ops = {
+ .type = &nft_notrack_type,
+ .size = NFT_EXPR_SIZE(0),
+ .eval = nft_notrack_eval,
+};
+
+static struct nft_expr_type nft_notrack_type __read_mostly = {
+ .name = "notrack",
+ .ops = &nft_notrack_ops,
+ .owner = THIS_MODULE,
+};
+
static int __init nft_ct_module_init(void)
{
+ int err;
+
BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > NFT_REG_SIZE);
- return nft_register_expr(&nft_ct_type);
+ err = nft_register_expr(&nft_ct_type);
+ if (err < 0)
+ return err;
+
+ err = nft_register_expr(&nft_notrack_type);
+ if (err < 0)
+ goto err1;
+
+ return 0;
+err1:
+ nft_unregister_expr(&nft_ct_type);
+ return err;
}
static void __exit nft_ct_module_exit(void)
{
+ nft_unregister_expr(&nft_notrack_type);
nft_unregister_expr(&nft_ct_type);
}
@@ -536,3 +765,4 @@ module_exit(nft_ct_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_EXPR("ct");
+MODULE_ALIAS_NFT_EXPR("notrack");
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 31ca94793aa9..049ad2d9ee66 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -98,7 +98,8 @@ out:
}
static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
- [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING },
+ [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING,
+ .len = NFT_SET_MAXNAMELEN - 1 },
[NFTA_DYNSET_SET_ID] = { .type = NLA_U32 },
[NFTA_DYNSET_OP] = { .type = NLA_U32 },
[NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 },
@@ -268,7 +269,6 @@ nla_put_failure:
return -1;
}
-static struct nft_expr_type nft_dynset_type;
static const struct nft_expr_ops nft_dynset_ops = {
.type = &nft_dynset_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_dynset)),
@@ -278,20 +278,10 @@ static const struct nft_expr_ops nft_dynset_ops = {
.dump = nft_dynset_dump,
};
-static struct nft_expr_type nft_dynset_type __read_mostly = {
+struct nft_expr_type nft_dynset_type __read_mostly = {
.name = "dynset",
.ops = &nft_dynset_ops,
.policy = nft_dynset_policy,
.maxattr = NFTA_DYNSET_MAX,
.owner = THIS_MODULE,
};
-
-int __init nft_dynset_module_init(void)
-{
- return nft_register_expr(&nft_dynset_type);
-}
-
-void nft_dynset_module_exit(void)
-{
- nft_unregister_expr(&nft_dynset_type);
-}
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 47beb3abcc9d..c308920b194c 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -15,19 +15,29 @@
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
-// FIXME:
-#include <net/ipv6.h>
+#include <net/tcp.h>
struct nft_exthdr {
u8 type;
u8 offset;
u8 len;
+ u8 op;
enum nft_registers dreg:8;
+ u8 flags;
};
-static void nft_exthdr_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static unsigned int optlen(const u8 *opt, unsigned int offset)
+{
+ /* Beware zero-length options: make finite progress */
+ if (opt[offset] <= TCPOPT_NOP || opt[offset + 1] == 0)
+ return 1;
+ else
+ return opt[offset + 1];
+}
+
+static void nft_exthdr_ipv6_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
struct nft_exthdr *priv = nft_expr_priv(expr);
u32 *dest = &regs->data[priv->dreg];
@@ -35,8 +45,12 @@ static void nft_exthdr_eval(const struct nft_expr *expr,
int err;
err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL);
- if (err < 0)
+ if (priv->flags & NFT_EXTHDR_F_PRESENT) {
+ *dest = (err >= 0);
+ return;
+ } else if (err < 0) {
goto err;
+ }
offset += priv->offset;
dest[priv->len / NFT_REG32_SIZE] = 0;
@@ -47,11 +61,59 @@ err:
regs->verdict.code = NFT_BREAK;
}
+static void nft_exthdr_tcp_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE];
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ unsigned int i, optl, tcphdr_len, offset;
+ u32 *dest = &regs->data[priv->dreg];
+ struct tcphdr *tcph;
+ u8 *opt;
+
+ if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP)
+ goto err;
+
+ tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buff);
+ if (!tcph)
+ goto err;
+
+ tcphdr_len = __tcp_hdrlen(tcph);
+ if (tcphdr_len < sizeof(*tcph))
+ goto err;
+
+ tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, tcphdr_len, buff);
+ if (!tcph)
+ goto err;
+
+ opt = (u8 *)tcph;
+ for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
+ optl = optlen(opt, i);
+
+ if (priv->type != opt[i])
+ continue;
+
+ if (i + optl > tcphdr_len || priv->len + priv->offset > optl)
+ goto err;
+
+ offset = i + priv->offset;
+ dest[priv->len / NFT_REG32_SIZE] = 0;
+ memcpy(dest, opt + offset, priv->len);
+
+ return;
+ }
+
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
[NFTA_EXTHDR_DREG] = { .type = NLA_U32 },
[NFTA_EXTHDR_TYPE] = { .type = NLA_U8 },
[NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 },
[NFTA_EXTHDR_LEN] = { .type = NLA_U32 },
+ [NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 },
};
static int nft_exthdr_init(const struct nft_ctx *ctx,
@@ -59,13 +121,13 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_exthdr *priv = nft_expr_priv(expr);
- u32 offset, len;
+ u32 offset, len, flags = 0, op = NFT_EXTHDR_OP_IPV6;
int err;
- if (tb[NFTA_EXTHDR_DREG] == NULL ||
- tb[NFTA_EXTHDR_TYPE] == NULL ||
- tb[NFTA_EXTHDR_OFFSET] == NULL ||
- tb[NFTA_EXTHDR_LEN] == NULL)
+ if (!tb[NFTA_EXTHDR_DREG] ||
+ !tb[NFTA_EXTHDR_TYPE] ||
+ !tb[NFTA_EXTHDR_OFFSET] ||
+ !tb[NFTA_EXTHDR_LEN])
return -EINVAL;
err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset);
@@ -76,10 +138,27 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
+ if (tb[NFTA_EXTHDR_FLAGS]) {
+ err = nft_parse_u32_check(tb[NFTA_EXTHDR_FLAGS], U8_MAX, &flags);
+ if (err < 0)
+ return err;
+
+ if (flags & ~NFT_EXTHDR_F_PRESENT)
+ return -EINVAL;
+ }
+
+ if (tb[NFTA_EXTHDR_OP]) {
+ err = nft_parse_u32_check(tb[NFTA_EXTHDR_OP], U8_MAX, &op);
+ if (err < 0)
+ return err;
+ }
+
priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
priv->offset = offset;
priv->len = len;
priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]);
+ priv->flags = flags;
+ priv->op = op;
return nft_validate_register_store(ctx, priv->dreg, NULL,
NFT_DATA_VALUE, priv->len);
@@ -97,6 +176,10 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len)))
goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_EXTHDR_FLAGS, htonl(priv->flags)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_EXTHDR_OP, htonl(priv->op)))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -104,17 +187,45 @@ nla_put_failure:
}
static struct nft_expr_type nft_exthdr_type;
-static const struct nft_expr_ops nft_exthdr_ops = {
+static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
.type = &nft_exthdr_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
- .eval = nft_exthdr_eval,
+ .eval = nft_exthdr_ipv6_eval,
.init = nft_exthdr_init,
.dump = nft_exthdr_dump,
};
+static const struct nft_expr_ops nft_exthdr_tcp_ops = {
+ .type = &nft_exthdr_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+ .eval = nft_exthdr_tcp_eval,
+ .init = nft_exthdr_init,
+ .dump = nft_exthdr_dump,
+};
+
+static const struct nft_expr_ops *
+nft_exthdr_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ u32 op;
+
+ if (!tb[NFTA_EXTHDR_OP])
+ return &nft_exthdr_ipv6_ops;
+
+ op = ntohl(nla_get_u32(tb[NFTA_EXTHDR_OP]));
+ switch (op) {
+ case NFT_EXTHDR_OP_TCPOPT:
+ return &nft_exthdr_tcp_ops;
+ case NFT_EXTHDR_OP_IPV6:
+ return &nft_exthdr_ipv6_ops;
+ }
+
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
static struct nft_expr_type nft_exthdr_type __read_mostly = {
.name = "exthdr",
- .ops = &nft_exthdr_ops,
+ .select_ops = &nft_exthdr_select_ops,
.policy = nft_exthdr_policy,
.maxattr = NFTA_EXTHDR_MAX,
.owner = THIS_MODULE,
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
new file mode 100644
index 000000000000..29a4906adc27
--- /dev/null
+++ b/net/netfilter/nft_fib.c
@@ -0,0 +1,159 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Generic part shared by ipv4 and ipv6 backends.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_fib.h>
+
+const struct nla_policy nft_fib_policy[NFTA_FIB_MAX + 1] = {
+ [NFTA_FIB_DREG] = { .type = NLA_U32 },
+ [NFTA_FIB_RESULT] = { .type = NLA_U32 },
+ [NFTA_FIB_FLAGS] = { .type = NLA_U32 },
+};
+EXPORT_SYMBOL(nft_fib_policy);
+
+#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \
+ NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)
+
+int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ unsigned int hooks;
+
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF: /* fallthrough */
+ case NFT_FIB_RESULT_OIFNAME:
+ hooks = (1 << NF_INET_PRE_ROUTING);
+ break;
+ case NFT_FIB_RESULT_ADDRTYPE:
+ if (priv->flags & NFTA_FIB_F_IIF)
+ hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD);
+ else if (priv->flags & NFTA_FIB_F_OIF)
+ hooks = (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_FORWARD);
+ else
+ hooks = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING);
+
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+EXPORT_SYMBOL_GPL(nft_fib_validate);
+
+int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_fib *priv = nft_expr_priv(expr);
+ unsigned int len;
+ int err;
+
+ if (!tb[NFTA_FIB_DREG] || !tb[NFTA_FIB_RESULT] || !tb[NFTA_FIB_FLAGS])
+ return -EINVAL;
+
+ priv->flags = ntohl(nla_get_be32(tb[NFTA_FIB_FLAGS]));
+
+ if (priv->flags == 0 || (priv->flags & ~NFTA_FIB_F_ALL))
+ return -EINVAL;
+
+ if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) ==
+ (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR))
+ return -EINVAL;
+ if ((priv->flags & (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)) ==
+ (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF))
+ return -EINVAL;
+ if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) == 0)
+ return -EINVAL;
+
+ priv->result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
+ priv->dreg = nft_parse_register(tb[NFTA_FIB_DREG]);
+
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF:
+ if (priv->flags & NFTA_FIB_F_OIF)
+ return -EINVAL;
+ len = sizeof(int);
+ break;
+ case NFT_FIB_RESULT_OIFNAME:
+ if (priv->flags & NFTA_FIB_F_OIF)
+ return -EINVAL;
+ len = IFNAMSIZ;
+ break;
+ case NFT_FIB_RESULT_ADDRTYPE:
+ len = sizeof(u32);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ err = nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, len);
+ if (err < 0)
+ return err;
+
+ return nft_fib_validate(ctx, expr, NULL);
+}
+EXPORT_SYMBOL_GPL(nft_fib_init);
+
+int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_FIB_DREG, priv->dreg))
+ return -1;
+
+ if (nla_put_be32(skb, NFTA_FIB_RESULT, htonl(priv->result)))
+ return -1;
+
+ if (nla_put_be32(skb, NFTA_FIB_FLAGS, htonl(priv->flags)))
+ return -1;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_fib_dump);
+
+void nft_fib_store_result(void *reg, enum nft_fib_result r,
+ const struct nft_pktinfo *pkt, int index)
+{
+ struct net_device *dev;
+ u32 *dreg = reg;
+
+ switch (r) {
+ case NFT_FIB_RESULT_OIF:
+ *dreg = index;
+ break;
+ case NFT_FIB_RESULT_OIFNAME:
+ dev = dev_get_by_index_rcu(nft_net(pkt), index);
+ strncpy(reg, dev ? dev->name : "", IFNAMSIZ);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ *dreg = 0;
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(nft_fib_store_result);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
diff --git a/net/netfilter/nft_fib_inet.c b/net/netfilter/nft_fib_inet.c
new file mode 100644
index 000000000000..9120fc7228f4
--- /dev/null
+++ b/net/netfilter/nft_fib_inet.c
@@ -0,0 +1,82 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+#include <net/netfilter/nft_fib.h>
+
+static void nft_fib_inet_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF:
+ case NFT_FIB_RESULT_OIFNAME:
+ return nft_fib4_eval(expr, regs, pkt);
+ case NFT_FIB_RESULT_ADDRTYPE:
+ return nft_fib4_eval_type(expr, regs, pkt);
+ }
+ break;
+ case NFPROTO_IPV6:
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF:
+ case NFT_FIB_RESULT_OIFNAME:
+ return nft_fib6_eval(expr, regs, pkt);
+ case NFT_FIB_RESULT_ADDRTYPE:
+ return nft_fib6_eval_type(expr, regs, pkt);
+ }
+ break;
+ }
+
+ regs->verdict.code = NF_DROP;
+}
+
+static struct nft_expr_type nft_fib_inet_type;
+static const struct nft_expr_ops nft_fib_inet_ops = {
+ .type = &nft_fib_inet_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+ .eval = nft_fib_inet_eval,
+ .init = nft_fib_init,
+ .dump = nft_fib_dump,
+ .validate = nft_fib_validate,
+};
+
+static struct nft_expr_type nft_fib_inet_type __read_mostly = {
+ .family = NFPROTO_INET,
+ .name = "fib",
+ .ops = &nft_fib_inet_ops,
+ .policy = nft_fib_policy,
+ .maxattr = NFTA_FIB_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_fib_inet_module_init(void)
+{
+ return nft_register_expr(&nft_fib_inet_type);
+}
+
+static void __exit nft_fib_inet_module_exit(void)
+{
+ nft_unregister_expr(&nft_fib_inet_type);
+}
+
+module_init(nft_fib_inet_module_init);
+module_exit(nft_fib_inet_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_ALIAS_NFT_AF_EXPR(1, "fib");
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 763ebc3e0b2b..ce13a50b9189 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -26,8 +26,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
struct nft_fwd_netdev *priv = nft_expr_priv(expr);
int oif = regs->data[priv->sreg_dev];
- nf_dup_netdev_egress(pkt, oif);
- regs->verdict.code = NF_DROP;
+ nf_fwd_netdev_egress(pkt, oif);
+ regs->verdict.code = NF_STOLEN;
}
static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = {
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index d5447a22275c..c4dad1254ead 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -21,6 +21,7 @@ struct nft_hash {
enum nft_registers sreg:8;
enum nft_registers dreg:8;
u8 len;
+ bool autogen_seed:1;
u32 modulus;
u32 seed;
u32 offset;
@@ -58,7 +59,6 @@ static int nft_hash_init(const struct nft_ctx *ctx,
if (!tb[NFTA_HASH_SREG] ||
!tb[NFTA_HASH_DREG] ||
!tb[NFTA_HASH_LEN] ||
- !tb[NFTA_HASH_SEED] ||
!tb[NFTA_HASH_MODULUS])
return -EINVAL;
@@ -83,7 +83,12 @@ static int nft_hash_init(const struct nft_ctx *ctx,
if (priv->offset + priv->modulus - 1 < priv->offset)
return -EOVERFLOW;
- priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED]));
+ if (tb[NFTA_HASH_SEED]) {
+ priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED]));
+ } else {
+ priv->autogen_seed = true;
+ get_random_bytes(&priv->seed, sizeof(priv->seed));
+ }
return nft_validate_register_load(priv->sreg, len) &&
nft_validate_register_store(ctx, priv->dreg, NULL,
@@ -103,7 +108,8 @@ static int nft_hash_dump(struct sk_buff *skb,
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_HASH_MODULUS, htonl(priv->modulus)))
goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed)))
+ if (!priv->autogen_seed &&
+ nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed)))
goto nla_put_failure;
if (priv->offset != 0)
if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset)))
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index d17018ff54e6..728baf88295a 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -54,9 +54,6 @@ static int nft_immediate_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
- if (desc.len > U8_MAX)
- return -ERANGE;
-
priv->dlen = desc.len;
priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]);
@@ -105,7 +102,6 @@ static int nft_immediate_validate(const struct nft_ctx *ctx,
return 0;
}
-static struct nft_expr_type nft_imm_type;
static const struct nft_expr_ops nft_imm_ops = {
.type = &nft_imm_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
@@ -116,20 +112,10 @@ static const struct nft_expr_ops nft_imm_ops = {
.validate = nft_immediate_validate,
};
-static struct nft_expr_type nft_imm_type __read_mostly = {
+struct nft_expr_type nft_imm_type __read_mostly = {
.name = "immediate",
.ops = &nft_imm_ops,
.policy = nft_immediate_policy,
.maxattr = NFTA_IMMEDIATE_MAX,
.owner = THIS_MODULE,
};
-
-int __init nft_immediate_module_init(void)
-{
- return nft_register_expr(&nft_imm_type);
-}
-
-void nft_immediate_module_exit(void)
-{
- nft_unregister_expr(&nft_imm_type);
-}
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 1b01404bb33f..6f6e64423643 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -32,13 +32,15 @@ static void nft_log_eval(const struct nft_expr *expr,
{
const struct nft_log *priv = nft_expr_priv(expr);
- nf_log_packet(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in,
- pkt->out, &priv->loginfo, "%s", priv->prefix);
+ nf_log_packet(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb,
+ nft_in(pkt), nft_out(pkt), &priv->loginfo, "%s",
+ priv->prefix);
}
static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = {
[NFTA_LOG_GROUP] = { .type = NLA_U16 },
- [NFTA_LOG_PREFIX] = { .type = NLA_STRING },
+ [NFTA_LOG_PREFIX] = { .type = NLA_STRING,
+ .len = NF_LOG_PREFIXLEN - 1 },
[NFTA_LOG_SNAPLEN] = { .type = NLA_U32 },
[NFTA_LOG_QTHRESHOLD] = { .type = NLA_U16 },
[NFTA_LOG_LEVEL] = { .type = NLA_U32 },
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 8166b6994cc7..e21aea7e5ec8 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -35,9 +35,8 @@ static void nft_lookup_eval(const struct nft_expr *expr,
const struct nft_set_ext *ext;
bool found;
- found = set->ops->lookup(pkt->net, set, &regs->data[priv->sreg], &ext) ^
- priv->invert;
-
+ found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg],
+ &ext) ^ priv->invert;
if (!found) {
regs->verdict.code = NFT_BREAK;
return;
@@ -50,7 +49,8 @@ static void nft_lookup_eval(const struct nft_expr *expr,
}
static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = {
- [NFTA_LOOKUP_SET] = { .type = NLA_STRING },
+ [NFTA_LOOKUP_SET] = { .type = NLA_STRING,
+ .len = NFT_SET_MAXNAMELEN - 1 },
[NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 },
[NFTA_LOOKUP_SREG] = { .type = NLA_U32 },
[NFTA_LOOKUP_DREG] = { .type = NLA_U32 },
@@ -155,7 +155,6 @@ nla_put_failure:
return -1;
}
-static struct nft_expr_type nft_lookup_type;
static const struct nft_expr_ops nft_lookup_ops = {
.type = &nft_lookup_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
@@ -165,20 +164,10 @@ static const struct nft_expr_ops nft_lookup_ops = {
.dump = nft_lookup_dump,
};
-static struct nft_expr_type nft_lookup_type __read_mostly = {
+struct nft_expr_type nft_lookup_type __read_mostly = {
.name = "lookup",
.ops = &nft_lookup_ops,
.policy = nft_lookup_policy,
.maxattr = NFTA_LOOKUP_MAX,
.owner = THIS_MODULE,
};
-
-int __init nft_lookup_module_init(void)
-{
- return nft_register_expr(&nft_lookup_type);
-}
-
-void nft_lookup_module_exit(void)
-{
- nft_unregister_expr(&nft_lookup_type);
-}
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 81b5ad6165ac..11ce016cd479 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -77,7 +77,7 @@ int nft_masq_init(const struct nft_ctx *ctx,
}
}
- return 0;
+ return nf_ct_netns_get(ctx->net, ctx->afi->family);
}
EXPORT_SYMBOL_GPL(nft_masq_init);
@@ -105,4 +105,4 @@ nla_put_failure:
EXPORT_SYMBOL_GPL(nft_masq_dump);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 6c1e0246706e..7b60e01f38ff 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -36,7 +36,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
{
const struct nft_meta *priv = nft_expr_priv(expr);
const struct sk_buff *skb = pkt->skb;
- const struct net_device *in = pkt->in, *out = pkt->out;
+ const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
struct sock *sk;
u32 *dest = &regs->data[priv->dreg];
@@ -45,16 +45,15 @@ void nft_meta_get_eval(const struct nft_expr *expr,
*dest = skb->len;
break;
case NFT_META_PROTOCOL:
- *dest = 0;
- *(__be16 *)dest = skb->protocol;
+ nft_reg_store16(dest, (__force u16)skb->protocol);
break;
case NFT_META_NFPROTO:
- *dest = pkt->pf;
+ nft_reg_store8(dest, nft_pf(pkt));
break;
case NFT_META_L4PROTO:
if (!pkt->tprot_set)
goto err;
- *dest = pkt->tprot;
+ nft_reg_store8(dest, pkt->tprot);
break;
case NFT_META_PRIORITY:
*dest = skb->priority;
@@ -85,14 +84,12 @@ void nft_meta_get_eval(const struct nft_expr *expr,
case NFT_META_IIFTYPE:
if (in == NULL)
goto err;
- *dest = 0;
- *(u16 *)dest = in->type;
+ nft_reg_store16(dest, in->type);
break;
case NFT_META_OIFTYPE:
if (out == NULL)
goto err;
- *dest = 0;
- *(u16 *)dest = out->type;
+ nft_reg_store16(dest, out->type);
break;
case NFT_META_SKUID:
sk = skb_to_full_sk(skb);
@@ -142,25 +139,48 @@ void nft_meta_get_eval(const struct nft_expr *expr,
#endif
case NFT_META_PKTTYPE:
if (skb->pkt_type != PACKET_LOOPBACK) {
- *dest = skb->pkt_type;
+ nft_reg_store8(dest, skb->pkt_type);
break;
}
- switch (pkt->pf) {
+ switch (nft_pf(pkt)) {
case NFPROTO_IPV4:
if (ipv4_is_multicast(ip_hdr(skb)->daddr))
- *dest = PACKET_MULTICAST;
+ nft_reg_store8(dest, PACKET_MULTICAST);
else
- *dest = PACKET_BROADCAST;
+ nft_reg_store8(dest, PACKET_BROADCAST);
break;
case NFPROTO_IPV6:
- if (ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF)
- *dest = PACKET_MULTICAST;
- else
- *dest = PACKET_BROADCAST;
+ nft_reg_store8(dest, PACKET_MULTICAST);
+ break;
+ case NFPROTO_NETDEV:
+ switch (skb->protocol) {
+ case htons(ETH_P_IP): {
+ int noff = skb_network_offset(skb);
+ struct iphdr *iph, _iph;
+
+ iph = skb_header_pointer(skb, noff,
+ sizeof(_iph), &_iph);
+ if (!iph)
+ goto err;
+
+ if (ipv4_is_multicast(iph->daddr))
+ nft_reg_store8(dest, PACKET_MULTICAST);
+ else
+ nft_reg_store8(dest, PACKET_BROADCAST);
+
+ break;
+ }
+ case htons(ETH_P_IPV6):
+ nft_reg_store8(dest, PACKET_MULTICAST);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ goto err;
+ }
break;
default:
- WARN_ON(1);
+ WARN_ON_ONCE(1);
goto err;
}
break;
@@ -207,7 +227,9 @@ void nft_meta_set_eval(const struct nft_expr *expr,
{
const struct nft_meta *meta = nft_expr_priv(expr);
struct sk_buff *skb = pkt->skb;
- u32 value = regs->data[meta->sreg];
+ u32 *sreg = &regs->data[meta->sreg];
+ u32 value = *sreg;
+ u8 pkt_type;
switch (meta->key) {
case NFT_META_MARK:
@@ -217,9 +239,12 @@ void nft_meta_set_eval(const struct nft_expr *expr,
skb->priority = value;
break;
case NFT_META_PKTTYPE:
- if (skb->pkt_type != value &&
- skb_pkt_type_ok(value) && skb_pkt_type_ok(skb->pkt_type))
- skb->pkt_type = value;
+ pkt_type = nft_reg_load8(sreg);
+
+ if (skb->pkt_type != pkt_type &&
+ skb_pkt_type_ok(pkt_type) &&
+ skb_pkt_type_ok(skb->pkt_type))
+ skb->pkt_type = pkt_type;
break;
case NFT_META_NFTRACE:
skb->nf_trace = !!value;
@@ -310,6 +335,11 @@ int nft_meta_set_validate(const struct nft_ctx *ctx,
case NFPROTO_NETDEV:
hooks = 1 << NF_NETDEV_INGRESS;
break;
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ case NFPROTO_INET:
+ hooks = 1 << NF_INET_PRE_ROUTING;
+ break;
default:
return -EOPNOTSUPP;
}
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index ee2d71753746..439e0bd152a0 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -65,10 +65,10 @@ static void nft_nat_eval(const struct nft_expr *expr,
}
if (priv->sreg_proto_min) {
- range.min_proto.all =
- *(__be16 *)&regs->data[priv->sreg_proto_min];
- range.max_proto.all =
- *(__be16 *)&regs->data[priv->sreg_proto_max];
+ range.min_proto.all = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg_proto_min]);
+ range.max_proto.all = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg_proto_max]);
range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
}
@@ -209,7 +209,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(ctx->net, family);
}
static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -257,12 +257,21 @@ nla_put_failure:
return -1;
}
+static void
+nft_nat_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ const struct nft_nat *priv = nft_expr_priv(expr);
+
+ nf_ct_netns_put(ctx->net, priv->family);
+}
+
static struct nft_expr_type nft_nat_type;
static const struct nft_expr_ops nft_nat_ops = {
.type = &nft_nat_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_nat)),
.eval = nft_nat_eval,
.init = nft_nat_init,
+ .destroy = nft_nat_destroy,
.dump = nft_nat_dump,
.validate = nft_nat_validate,
};
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 55bc5ab78d4a..a66b36097b8f 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -65,7 +65,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
return -EOVERFLOW;
priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
- atomic_set(&priv->counter, 0);
+ atomic_set(&priv->counter, priv->modulus - 1);
return nft_validate_register_store(ctx, priv->dreg, NULL,
NFT_DATA_VALUE, sizeof(u32));
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
new file mode 100644
index 000000000000..1ae8c49ca4a1
--- /dev/null
+++ b/net/netfilter/nft_objref.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2012-2016 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+#define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr))
+
+static void nft_objref_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_object *obj = nft_objref_priv(expr);
+
+ obj->type->eval(obj, regs, pkt);
+}
+
+static int nft_objref_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_object *obj = nft_objref_priv(expr);
+ u8 genmask = nft_genmask_next(ctx->net);
+ u32 objtype;
+
+ if (!tb[NFTA_OBJREF_IMM_NAME] ||
+ !tb[NFTA_OBJREF_IMM_TYPE])
+ return -EINVAL;
+
+ objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE]));
+ obj = nf_tables_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype,
+ genmask);
+ if (IS_ERR(obj))
+ return -ENOENT;
+
+ nft_objref_priv(expr) = obj;
+ obj->use++;
+
+ return 0;
+}
+
+static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_object *obj = nft_objref_priv(expr);
+
+ if (nla_put_string(skb, NFTA_OBJREF_IMM_NAME, obj->name) ||
+ nla_put_be32(skb, NFTA_OBJREF_IMM_TYPE, htonl(obj->type->type)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static void nft_objref_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_object *obj = nft_objref_priv(expr);
+
+ obj->use--;
+}
+
+static struct nft_expr_type nft_objref_type;
+static const struct nft_expr_ops nft_objref_ops = {
+ .type = &nft_objref_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_object *)),
+ .eval = nft_objref_eval,
+ .init = nft_objref_init,
+ .destroy = nft_objref_destroy,
+ .dump = nft_objref_dump,
+};
+
+struct nft_objref_map {
+ struct nft_set *set;
+ enum nft_registers sreg:8;
+ struct nft_set_binding binding;
+};
+
+static void nft_objref_map_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_objref_map *priv = nft_expr_priv(expr);
+ const struct nft_set *set = priv->set;
+ const struct nft_set_ext *ext;
+ struct nft_object *obj;
+ bool found;
+
+ found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg],
+ &ext);
+ if (!found) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ obj = *nft_set_ext_obj(ext);
+ obj->type->eval(obj, regs, pkt);
+}
+
+static int nft_objref_map_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_objref_map *priv = nft_expr_priv(expr);
+ u8 genmask = nft_genmask_next(ctx->net);
+ struct nft_set *set;
+ int err;
+
+ set = nf_tables_set_lookup(ctx->table, tb[NFTA_OBJREF_SET_NAME], genmask);
+ if (IS_ERR(set)) {
+ if (tb[NFTA_OBJREF_SET_ID]) {
+ set = nf_tables_set_lookup_byid(ctx->net,
+ tb[NFTA_OBJREF_SET_ID],
+ genmask);
+ }
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ }
+
+ if (!(set->flags & NFT_SET_OBJECT))
+ return -EINVAL;
+
+ priv->sreg = nft_parse_register(tb[NFTA_OBJREF_SET_SREG]);
+ err = nft_validate_register_load(priv->sreg, set->klen);
+ if (err < 0)
+ return err;
+
+ priv->binding.flags = set->flags & NFT_SET_OBJECT;
+
+ err = nf_tables_bind_set(ctx, set, &priv->binding);
+ if (err < 0)
+ return err;
+
+ priv->set = set;
+ return 0;
+}
+
+static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_objref_map *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_OBJREF_SET_SREG, priv->sreg) ||
+ nla_put_string(skb, NFTA_OBJREF_SET_NAME, priv->set->name))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static void nft_objref_map_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_objref_map *priv = nft_expr_priv(expr);
+
+ nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+}
+
+static struct nft_expr_type nft_objref_type;
+static const struct nft_expr_ops nft_objref_map_ops = {
+ .type = &nft_objref_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)),
+ .eval = nft_objref_map_eval,
+ .init = nft_objref_map_init,
+ .destroy = nft_objref_map_destroy,
+ .dump = nft_objref_map_dump,
+};
+
+static const struct nft_expr_ops *
+nft_objref_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ if (tb[NFTA_OBJREF_SET_SREG] &&
+ (tb[NFTA_OBJREF_SET_NAME] ||
+ tb[NFTA_OBJREF_SET_ID]))
+ return &nft_objref_map_ops;
+ else if (tb[NFTA_OBJREF_IMM_NAME] &&
+ tb[NFTA_OBJREF_IMM_TYPE])
+ return &nft_objref_ops;
+
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = {
+ [NFTA_OBJREF_IMM_NAME] = { .type = NLA_STRING,
+ .len = NFT_OBJ_MAXNAMELEN - 1 },
+ [NFTA_OBJREF_IMM_TYPE] = { .type = NLA_U32 },
+ [NFTA_OBJREF_SET_SREG] = { .type = NLA_U32 },
+ [NFTA_OBJREF_SET_NAME] = { .type = NLA_STRING,
+ .len = NFT_SET_MAXNAMELEN - 1 },
+ [NFTA_OBJREF_SET_ID] = { .type = NLA_U32 },
+};
+
+static struct nft_expr_type nft_objref_type __read_mostly = {
+ .name = "objref",
+ .select_ops = nft_objref_select_ops,
+ .policy = nft_objref_policy,
+ .maxattr = NFTA_OBJREF_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_objref_module_init(void)
+{
+ return nft_register_expr(&nft_objref_type);
+}
+
+static void __exit nft_objref_module_exit(void)
+{
+ nft_unregister_expr(&nft_objref_type);
+}
+
+module_init(nft_objref_module_init);
+module_exit(nft_objref_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("objref");
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index b2f88617611a..7d699bbd45b0 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -17,6 +18,10 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
+/* For layer 4 checksum field offset. */
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmpv6.h>
/* add vlan header into the user buffer for if tag was removed by offloads */
static bool
@@ -148,7 +153,6 @@ nla_put_failure:
return -1;
}
-static struct nft_expr_type nft_payload_type;
static const struct nft_expr_ops nft_payload_ops = {
.type = &nft_payload_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_payload)),
@@ -165,6 +169,103 @@ const struct nft_expr_ops nft_payload_fast_ops = {
.dump = nft_payload_dump,
};
+static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum)
+{
+ *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum));
+ if (*sum == 0)
+ *sum = CSUM_MANGLED_0;
+}
+
+static bool nft_payload_udp_checksum(struct sk_buff *skb, unsigned int thoff)
+{
+ struct udphdr *uh, _uh;
+
+ uh = skb_header_pointer(skb, thoff, sizeof(_uh), &_uh);
+ if (!uh)
+ return false;
+
+ return uh->check;
+}
+
+static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt,
+ struct sk_buff *skb,
+ unsigned int *l4csum_offset)
+{
+ switch (pkt->tprot) {
+ case IPPROTO_TCP:
+ *l4csum_offset = offsetof(struct tcphdr, check);
+ break;
+ case IPPROTO_UDP:
+ if (!nft_payload_udp_checksum(skb, pkt->xt.thoff))
+ return -1;
+ /* Fall through. */
+ case IPPROTO_UDPLITE:
+ *l4csum_offset = offsetof(struct udphdr, check);
+ break;
+ case IPPROTO_ICMPV6:
+ *l4csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
+ break;
+ default:
+ return -1;
+ }
+
+ *l4csum_offset += pkt->xt.thoff;
+ return 0;
+}
+
+static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt,
+ struct sk_buff *skb,
+ __wsum fsum, __wsum tsum)
+{
+ int l4csum_offset;
+ __sum16 sum;
+
+ /* If we cannot determine layer 4 checksum offset or this packet doesn't
+ * require layer 4 checksum recalculation, skip this packet.
+ */
+ if (nft_payload_l4csum_offset(pkt, skb, &l4csum_offset) < 0)
+ return 0;
+
+ if (skb_copy_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0)
+ return -1;
+
+ /* Checksum mangling for an arbitrary amount of bytes, based on
+ * inet_proto_csum_replace*() functions.
+ */
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ nft_csum_replace(&sum, fsum, tsum);
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ skb->csum = ~csum_add(csum_sub(~(skb->csum), fsum),
+ tsum);
+ }
+ } else {
+ sum = ~csum_fold(csum_add(csum_sub(csum_unfold(sum), fsum),
+ tsum));
+ }
+
+ if (!skb_make_writable(skb, l4csum_offset + sizeof(sum)) ||
+ skb_store_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src,
+ __wsum fsum, __wsum tsum, int csum_offset)
+{
+ __sum16 sum;
+
+ if (skb_copy_bits(skb, csum_offset, &sum, sizeof(sum)) < 0)
+ return -1;
+
+ nft_csum_replace(&sum, fsum, tsum);
+ if (!skb_make_writable(skb, csum_offset + sizeof(sum)) ||
+ skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0)
+ return -1;
+
+ return 0;
+}
+
static void nft_payload_set_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -174,7 +275,6 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
const u32 *src = &regs->data[priv->sreg];
int offset, csum_offset;
__wsum fsum, tsum;
- __sum16 sum;
switch (priv->base) {
case NFT_PAYLOAD_LL_HEADER:
@@ -197,21 +297,18 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
csum_offset = offset + priv->csum_offset;
offset += priv->offset;
- if (priv->csum_type == NFT_PAYLOAD_CSUM_INET &&
+ if ((priv->csum_type == NFT_PAYLOAD_CSUM_INET || priv->csum_flags) &&
(priv->base != NFT_PAYLOAD_TRANSPORT_HEADER ||
skb->ip_summed != CHECKSUM_PARTIAL)) {
- if (skb_copy_bits(skb, csum_offset, &sum, sizeof(sum)) < 0)
- goto err;
-
fsum = skb_checksum(skb, offset, priv->len, 0);
tsum = csum_partial(src, priv->len, 0);
- sum = csum_fold(csum_add(csum_sub(~csum_unfold(sum), fsum),
- tsum));
- if (sum == 0)
- sum = CSUM_MANGLED_0;
- if (!skb_make_writable(skb, csum_offset + sizeof(sum)) ||
- skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0)
+ if (priv->csum_type == NFT_PAYLOAD_CSUM_INET &&
+ nft_payload_csum_inet(skb, src, fsum, tsum, csum_offset))
+ goto err;
+
+ if (priv->csum_flags &&
+ nft_payload_l4csum_update(pkt, skb, fsum, tsum) < 0)
goto err;
}
@@ -241,6 +338,15 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
if (tb[NFTA_PAYLOAD_CSUM_OFFSET])
priv->csum_offset =
ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET]));
+ if (tb[NFTA_PAYLOAD_CSUM_FLAGS]) {
+ u32 flags;
+
+ flags = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_FLAGS]));
+ if (flags & ~NFT_PAYLOAD_L4CSUM_PSEUDOHDR)
+ return -EINVAL;
+
+ priv->csum_flags = flags;
+ }
switch (priv->csum_type) {
case NFT_PAYLOAD_CSUM_NONE:
@@ -263,7 +369,8 @@ static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr
nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len)) ||
nla_put_be32(skb, NFTA_PAYLOAD_CSUM_TYPE, htonl(priv->csum_type)) ||
nla_put_be32(skb, NFTA_PAYLOAD_CSUM_OFFSET,
- htonl(priv->csum_offset)))
+ htonl(priv->csum_offset)) ||
+ nla_put_be32(skb, NFTA_PAYLOAD_CSUM_FLAGS, htonl(priv->csum_flags)))
goto nla_put_failure;
return 0;
@@ -320,20 +427,10 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
return &nft_payload_ops;
}
-static struct nft_expr_type nft_payload_type __read_mostly = {
+struct nft_expr_type nft_payload_type __read_mostly = {
.name = "payload",
.select_ops = nft_payload_select_ops,
.policy = nft_payload_policy,
.maxattr = NFTA_PAYLOAD_MAX,
.owner = THIS_MODULE,
};
-
-int __init nft_payload_module_init(void)
-{
- return nft_register_expr(&nft_payload_type);
-}
-
-void nft_payload_module_exit(void)
-{
- nft_unregister_expr(&nft_payload_type);
-}
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index 393d359a1889..dbb6aaff67ec 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -38,12 +38,12 @@ static void nft_queue_eval(const struct nft_expr *expr,
if (priv->queues_total > 1) {
if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) {
- int cpu = smp_processor_id();
+ int cpu = raw_smp_processor_id();
queue = priv->queuenum + cpu % priv->queues_total;
} else {
queue = nfqueue_hash(pkt->skb, queue,
- priv->queues_total, pkt->pf,
+ priv->queues_total, nft_pf(pkt),
jhash_initval);
}
}
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index c00104c07095..2d6fe3559912 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -17,38 +17,59 @@
struct nft_quota {
u64 quota;
- bool invert;
- atomic64_t remain;
+ unsigned long flags;
+ atomic64_t consumed;
};
static inline bool nft_overquota(struct nft_quota *priv,
- const struct nft_pktinfo *pkt)
+ const struct sk_buff *skb)
{
- return atomic64_sub_return(pkt->skb->len, &priv->remain) < 0;
+ return atomic64_add_return(skb->len, &priv->consumed) >= priv->quota;
}
-static void nft_quota_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static inline bool nft_quota_invert(struct nft_quota *priv)
{
- struct nft_quota *priv = nft_expr_priv(expr);
+ return priv->flags & NFT_QUOTA_F_INV;
+}
- if (nft_overquota(priv, pkt) ^ priv->invert)
+static inline void nft_quota_do_eval(struct nft_quota *priv,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv))
regs->verdict.code = NFT_BREAK;
}
static const struct nla_policy nft_quota_policy[NFTA_QUOTA_MAX + 1] = {
[NFTA_QUOTA_BYTES] = { .type = NLA_U64 },
[NFTA_QUOTA_FLAGS] = { .type = NLA_U32 },
+ [NFTA_QUOTA_CONSUMED] = { .type = NLA_U64 },
};
-static int nft_quota_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+#define NFT_QUOTA_DEPLETED_BIT 1 /* From NFT_QUOTA_F_DEPLETED. */
+
+static void nft_quota_obj_eval(struct nft_object *obj,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
- struct nft_quota *priv = nft_expr_priv(expr);
- u32 flags = 0;
- u64 quota;
+ struct nft_quota *priv = nft_obj_data(obj);
+ bool overquota;
+
+ overquota = nft_overquota(priv, pkt->skb);
+ if (overquota ^ nft_quota_invert(priv))
+ regs->verdict.code = NFT_BREAK;
+
+ if (overquota &&
+ !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags))
+ nft_obj_notify(nft_net(pkt), obj->table, obj, 0, 0,
+ NFT_MSG_NEWOBJ, nft_pf(pkt), 0, GFP_ATOMIC);
+}
+
+static int nft_quota_do_init(const struct nlattr * const tb[],
+ struct nft_quota *priv)
+{
+ unsigned long flags = 0;
+ u64 quota, consumed = 0;
if (!tb[NFTA_QUOTA_BYTES])
return -EINVAL;
@@ -57,34 +78,114 @@ static int nft_quota_init(const struct nft_ctx *ctx,
if (quota > S64_MAX)
return -EOVERFLOW;
+ if (tb[NFTA_QUOTA_CONSUMED]) {
+ consumed = be64_to_cpu(nla_get_be64(tb[NFTA_QUOTA_CONSUMED]));
+ if (consumed > quota)
+ return -EINVAL;
+ }
+
if (tb[NFTA_QUOTA_FLAGS]) {
flags = ntohl(nla_get_be32(tb[NFTA_QUOTA_FLAGS]));
if (flags & ~NFT_QUOTA_F_INV)
return -EINVAL;
+ if (flags & NFT_QUOTA_F_DEPLETED)
+ return -EOPNOTSUPP;
}
priv->quota = quota;
- priv->invert = (flags & NFT_QUOTA_F_INV) ? true : false;
- atomic64_set(&priv->remain, quota);
+ priv->flags = flags;
+ atomic64_set(&priv->consumed, consumed);
return 0;
}
-static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_quota_obj_init(const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+ struct nft_quota *priv = nft_obj_data(obj);
+
+ return nft_quota_do_init(tb, priv);
+}
+
+static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv,
+ bool reset)
{
- const struct nft_quota *priv = nft_expr_priv(expr);
- u32 flags = priv->invert ? NFT_QUOTA_F_INV : 0;
+ u64 consumed, consumed_cap;
+ u32 flags = priv->flags;
+
+ /* Since we inconditionally increment consumed quota for each packet
+ * that we see, don't go over the quota boundary in what we send to
+ * userspace.
+ */
+ consumed = atomic64_read(&priv->consumed);
+ if (consumed >= priv->quota) {
+ consumed_cap = priv->quota;
+ flags |= NFT_QUOTA_F_DEPLETED;
+ } else {
+ consumed_cap = consumed;
+ }
if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota),
NFTA_QUOTA_PAD) ||
+ nla_put_be64(skb, NFTA_QUOTA_CONSUMED, cpu_to_be64(consumed_cap),
+ NFTA_QUOTA_PAD) ||
nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags)))
goto nla_put_failure;
+
+ if (reset) {
+ atomic64_sub(consumed, &priv->consumed);
+ clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags);
+ }
return 0;
nla_put_failure:
return -1;
}
+static int nft_quota_obj_dump(struct sk_buff *skb, struct nft_object *obj,
+ bool reset)
+{
+ struct nft_quota *priv = nft_obj_data(obj);
+
+ return nft_quota_do_dump(skb, priv, reset);
+}
+
+static struct nft_object_type nft_quota_obj __read_mostly = {
+ .type = NFT_OBJECT_QUOTA,
+ .size = sizeof(struct nft_quota),
+ .maxattr = NFTA_QUOTA_MAX,
+ .policy = nft_quota_policy,
+ .init = nft_quota_obj_init,
+ .eval = nft_quota_obj_eval,
+ .dump = nft_quota_obj_dump,
+ .owner = THIS_MODULE,
+};
+
+static void nft_quota_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_quota *priv = nft_expr_priv(expr);
+
+ nft_quota_do_eval(priv, regs, pkt);
+}
+
+static int nft_quota_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_quota *priv = nft_expr_priv(expr);
+
+ return nft_quota_do_init(tb, priv);
+}
+
+static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ struct nft_quota *priv = nft_expr_priv(expr);
+
+ return nft_quota_do_dump(skb, priv, false);
+}
+
static struct nft_expr_type nft_quota_type;
static const struct nft_expr_ops nft_quota_ops = {
.type = &nft_quota_type,
@@ -105,12 +206,26 @@ static struct nft_expr_type nft_quota_type __read_mostly = {
static int __init nft_quota_module_init(void)
{
- return nft_register_expr(&nft_quota_type);
+ int err;
+
+ err = nft_register_obj(&nft_quota_obj);
+ if (err < 0)
+ return err;
+
+ err = nft_register_expr(&nft_quota_type);
+ if (err < 0)
+ goto err1;
+
+ return 0;
+err1:
+ nft_unregister_obj(&nft_quota_obj);
+ return err;
}
static void __exit nft_quota_module_exit(void)
{
- nft_unregister_expr(&nft_quota_type);
+ nft_unregister_expr(&nft_quota_type);
+ nft_unregister_obj(&nft_quota_obj);
}
module_init(nft_quota_module_init);
@@ -119,3 +234,4 @@ module_exit(nft_quota_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("quota");
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_QUOTA);
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
index 8f0aaaea1376..9edc74eedc10 100644
--- a/net/netfilter/nft_range.c
+++ b/net/netfilter/nft_range.c
@@ -128,7 +128,6 @@ nla_put_failure:
return -1;
}
-static struct nft_expr_type nft_range_type;
static const struct nft_expr_ops nft_range_ops = {
.type = &nft_range_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_range_expr)),
@@ -137,20 +136,10 @@ static const struct nft_expr_ops nft_range_ops = {
.dump = nft_range_dump,
};
-static struct nft_expr_type nft_range_type __read_mostly = {
+struct nft_expr_type nft_range_type __read_mostly = {
.name = "range",
.ops = &nft_range_ops,
.policy = nft_range_policy,
.maxattr = NFTA_RANGE_MAX,
.owner = THIS_MODULE,
};
-
-int __init nft_range_module_init(void)
-{
- return nft_register_expr(&nft_range_type);
-}
-
-void nft_range_module_exit(void)
-{
- nft_unregister_expr(&nft_range_type);
-}
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index 03f7bf40ae75..40dcd05146d5 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -79,7 +79,7 @@ int nft_redir_init(const struct nft_ctx *ctx,
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(ctx->net, ctx->afi->family);
}
EXPORT_SYMBOL_GPL(nft_redir_init);
@@ -108,4 +108,4 @@ nla_put_failure:
EXPORT_SYMBOL_GPL(nft_redir_dump);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
index e79d9ca2ffee..9e90a02cb104 100644
--- a/net/netfilter/nft_reject_inet.c
+++ b/net/netfilter/nft_reject_inet.c
@@ -23,36 +23,36 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
{
struct nft_reject *priv = nft_expr_priv(expr);
- switch (pkt->pf) {
+ switch (nft_pf(pkt)) {
case NFPROTO_IPV4:
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
nf_send_unreach(pkt->skb, priv->icmp_code,
- pkt->hook);
+ nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset(pkt->net, pkt->skb, pkt->hook);
+ nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt));
break;
case NFT_REJECT_ICMPX_UNREACH:
nf_send_unreach(pkt->skb,
nft_reject_icmp_code(priv->icmp_code),
- pkt->hook);
+ nft_hook(pkt));
break;
}
break;
case NFPROTO_IPV6:
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nf_send_unreach6(pkt->net, pkt->skb, priv->icmp_code,
- pkt->hook);
+ nf_send_unreach6(nft_net(pkt), pkt->skb,
+ priv->icmp_code, nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset6(pkt->net, pkt->skb, pkt->hook);
+ nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt));
break;
case NFT_REJECT_ICMPX_UNREACH:
- nf_send_unreach6(pkt->net, pkt->skb,
+ nf_send_unreach6(nft_net(pkt), pkt->skb,
nft_reject_icmpv6_code(priv->icmp_code),
- pkt->hook);
+ nft_hook(pkt));
break;
}
break;
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
new file mode 100644
index 000000000000..d3eb640bc784
--- /dev/null
+++ b/net/netfilter/nft_rt.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2016 Anders K. Pedersen <akp@cohaesio.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/dst.h>
+#include <net/ip6_route.h>
+#include <net/route.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+
+struct nft_rt {
+ enum nft_rt_keys key:8;
+ enum nft_registers dreg:8;
+};
+
+void nft_rt_get_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_rt *priv = nft_expr_priv(expr);
+ const struct sk_buff *skb = pkt->skb;
+ u32 *dest = &regs->data[priv->dreg];
+ const struct dst_entry *dst;
+
+ dst = skb_dst(skb);
+ if (!dst)
+ goto err;
+
+ switch (priv->key) {
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ case NFT_RT_CLASSID:
+ *dest = dst->tclassid;
+ break;
+#endif
+ case NFT_RT_NEXTHOP4:
+ if (nft_pf(pkt) != NFPROTO_IPV4)
+ goto err;
+
+ *dest = rt_nexthop((const struct rtable *)dst,
+ ip_hdr(skb)->daddr);
+ break;
+ case NFT_RT_NEXTHOP6:
+ if (nft_pf(pkt) != NFPROTO_IPV6)
+ goto err;
+
+ memcpy(dest, rt6_nexthop((struct rt6_info *)dst,
+ &ipv6_hdr(skb)->daddr),
+ sizeof(struct in6_addr));
+ break;
+ default:
+ WARN_ON(1);
+ goto err;
+ }
+ return;
+
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+const struct nla_policy nft_rt_policy[NFTA_RT_MAX + 1] = {
+ [NFTA_RT_DREG] = { .type = NLA_U32 },
+ [NFTA_RT_KEY] = { .type = NLA_U32 },
+};
+
+int nft_rt_get_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_rt *priv = nft_expr_priv(expr);
+ unsigned int len;
+
+ if (tb[NFTA_RT_KEY] == NULL ||
+ tb[NFTA_RT_DREG] == NULL)
+ return -EINVAL;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_RT_KEY]));
+ switch (priv->key) {
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ case NFT_RT_CLASSID:
+#endif
+ case NFT_RT_NEXTHOP4:
+ len = sizeof(u32);
+ break;
+ case NFT_RT_NEXTHOP6:
+ len = sizeof(struct in6_addr);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->dreg = nft_parse_register(tb[NFTA_RT_DREG]);
+ return nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, len);
+}
+
+int nft_rt_get_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_rt *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_RT_KEY, htonl(priv->key)))
+ goto nla_put_failure;
+ if (nft_dump_register(skb, NFTA_RT_DREG, priv->dreg))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_rt_type;
+static const struct nft_expr_ops nft_rt_get_ops = {
+ .type = &nft_rt_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_rt)),
+ .eval = nft_rt_get_eval,
+ .init = nft_rt_get_init,
+ .dump = nft_rt_get_dump,
+};
+
+static struct nft_expr_type nft_rt_type __read_mostly = {
+ .name = "rt",
+ .ops = &nft_rt_get_ops,
+ .policy = nft_rt_policy,
+ .maxattr = NFTA_RT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_rt_module_init(void)
+{
+ return nft_register_expr(&nft_rt_type);
+}
+
+static void __exit nft_rt_module_exit(void)
+{
+ nft_unregister_expr(&nft_rt_type);
+}
+
+module_init(nft_rt_module_init);
+module_exit(nft_rt_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Anders K. Pedersen <akp@cohaesio.com>");
+MODULE_ALIAS_NFT_EXPR("rt");
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
new file mode 100644
index 000000000000..8ebbc2940f4c
--- /dev/null
+++ b/net/netfilter/nft_set_bitmap.c
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2017 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_bitmap_elem {
+ struct list_head head;
+ struct nft_set_ext ext;
+};
+
+/* This bitmap uses two bits to represent one element. These two bits determine
+ * the element state in the current and the future generation.
+ *
+ * An element can be in three states. The generation cursor is represented using
+ * the ^ character, note that this cursor shifts on every succesful transaction.
+ * If no transaction is going on, we observe all elements are in the following
+ * state:
+ *
+ * 11 = this element is active in the current generation. In case of no updates,
+ * ^ it stays active in the next generation.
+ * 00 = this element is inactive in the current generation. In case of no
+ * ^ updates, it stays inactive in the next generation.
+ *
+ * On transaction handling, we observe these two temporary states:
+ *
+ * 01 = this element is inactive in the current generation and it becomes active
+ * ^ in the next one. This happens when the element is inserted but commit
+ * path has not yet been executed yet, so activation is still pending. On
+ * transaction abortion, the element is removed.
+ * 10 = this element is active in the current generation and it becomes inactive
+ * ^ in the next one. This happens when the element is deactivated but commit
+ * path has not yet been executed yet, so removal is still pending. On
+ * transation abortion, the next generation bit is reset to go back to
+ * restore its previous state.
+ */
+struct nft_bitmap {
+ struct list_head list;
+ u16 bitmap_size;
+ u8 bitmap[];
+};
+
+static inline void nft_bitmap_location(const struct nft_set *set,
+ const void *key,
+ u32 *idx, u32 *off)
+{
+ u32 k;
+
+ if (set->klen == 2)
+ k = *(u16 *)key;
+ else
+ k = *(u8 *)key;
+ k <<= 1;
+
+ *idx = k / BITS_PER_BYTE;
+ *off = k % BITS_PER_BYTE;
+}
+
+/* Fetch the two bits that represent the element and check if it is active based
+ * on the generation mask.
+ */
+static inline bool
+nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask)
+{
+ return (bitmap[idx] & (0x3 << off)) & (genmask << off);
+}
+
+static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
+{
+ const struct nft_bitmap *priv = nft_set_priv(set);
+ u8 genmask = nft_genmask_cur(net);
+ u32 idx, off;
+
+ nft_bitmap_location(set, key, &idx, &off);
+
+ return nft_bitmap_active(priv->bitmap, idx, off, genmask);
+}
+
+static struct nft_bitmap_elem *
+nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this,
+ u8 genmask)
+{
+ const struct nft_bitmap *priv = nft_set_priv(set);
+ struct nft_bitmap_elem *be;
+
+ list_for_each_entry_rcu(be, &priv->list, head) {
+ if (memcmp(nft_set_ext_key(&be->ext),
+ nft_set_ext_key(&this->ext), set->klen) ||
+ !nft_set_elem_active(&be->ext, genmask))
+ continue;
+
+ return be;
+ }
+ return NULL;
+}
+
+static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem,
+ struct nft_set_ext **ext)
+{
+ struct nft_bitmap *priv = nft_set_priv(set);
+ struct nft_bitmap_elem *new = elem->priv, *be;
+ u8 genmask = nft_genmask_next(net);
+ u32 idx, off;
+
+ be = nft_bitmap_elem_find(set, new, genmask);
+ if (be) {
+ *ext = &be->ext;
+ return -EEXIST;
+ }
+
+ nft_bitmap_location(set, nft_set_ext_key(&new->ext), &idx, &off);
+ /* Enter 01 state. */
+ priv->bitmap[idx] |= (genmask << off);
+ list_add_tail_rcu(&new->head, &priv->list);
+
+ return 0;
+}
+
+static void nft_bitmap_remove(const struct net *net,
+ const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_bitmap *priv = nft_set_priv(set);
+ struct nft_bitmap_elem *be = elem->priv;
+ u8 genmask = nft_genmask_next(net);
+ u32 idx, off;
+
+ nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
+ /* Enter 00 state. */
+ priv->bitmap[idx] &= ~(genmask << off);
+ list_del_rcu(&be->head);
+}
+
+static void nft_bitmap_activate(const struct net *net,
+ const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_bitmap *priv = nft_set_priv(set);
+ struct nft_bitmap_elem *be = elem->priv;
+ u8 genmask = nft_genmask_next(net);
+ u32 idx, off;
+
+ nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
+ /* Enter 11 state. */
+ priv->bitmap[idx] |= (genmask << off);
+ nft_set_elem_change_active(net, set, &be->ext);
+}
+
+static bool nft_bitmap_flush(const struct net *net,
+ const struct nft_set *set, void *_be)
+{
+ struct nft_bitmap *priv = nft_set_priv(set);
+ u8 genmask = nft_genmask_next(net);
+ struct nft_bitmap_elem *be = _be;
+ u32 idx, off;
+
+ nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
+ /* Enter 10 state, similar to deactivation. */
+ priv->bitmap[idx] &= ~(genmask << off);
+ nft_set_elem_change_active(net, set, &be->ext);
+
+ return true;
+}
+
+static void *nft_bitmap_deactivate(const struct net *net,
+ const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_bitmap *priv = nft_set_priv(set);
+ struct nft_bitmap_elem *this = elem->priv, *be;
+ u8 genmask = nft_genmask_next(net);
+ u32 idx, off;
+
+ nft_bitmap_location(set, elem->key.val.data, &idx, &off);
+
+ be = nft_bitmap_elem_find(set, this, genmask);
+ if (!be)
+ return NULL;
+
+ /* Enter 10 state. */
+ priv->bitmap[idx] &= ~(genmask << off);
+ nft_set_elem_change_active(net, set, &be->ext);
+
+ return be;
+}
+
+static void nft_bitmap_walk(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ const struct nft_bitmap *priv = nft_set_priv(set);
+ struct nft_bitmap_elem *be;
+ struct nft_set_elem elem;
+
+ list_for_each_entry_rcu(be, &priv->list, head) {
+ if (iter->count < iter->skip)
+ goto cont;
+ if (!nft_set_elem_active(&be->ext, iter->genmask))
+ goto cont;
+
+ elem.priv = be;
+
+ iter->err = iter->fn(ctx, set, iter, &elem);
+
+ if (iter->err < 0)
+ return;
+cont:
+ iter->count++;
+ }
+}
+
+/* The bitmap size is pow(2, key length in bits) / bits per byte. This is
+ * multiplied by two since each element takes two bits. For 8 bit keys, the
+ * bitmap consumes 66 bytes. For 16 bit keys, 16388 bytes.
+ */
+static inline u32 nft_bitmap_size(u32 klen)
+{
+ return ((2 << ((klen * BITS_PER_BYTE) - 1)) / BITS_PER_BYTE) << 1;
+}
+
+static inline u32 nft_bitmap_total_size(u32 klen)
+{
+ return sizeof(struct nft_bitmap) + nft_bitmap_size(klen);
+}
+
+static unsigned int nft_bitmap_privsize(const struct nlattr * const nla[])
+{
+ u32 klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
+
+ return nft_bitmap_total_size(klen);
+}
+
+static int nft_bitmap_init(const struct nft_set *set,
+ const struct nft_set_desc *desc,
+ const struct nlattr * const nla[])
+{
+ struct nft_bitmap *priv = nft_set_priv(set);
+
+ INIT_LIST_HEAD(&priv->list);
+ priv->bitmap_size = nft_bitmap_size(set->klen);
+
+ return 0;
+}
+
+static void nft_bitmap_destroy(const struct nft_set *set)
+{
+}
+
+static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est)
+{
+ /* Make sure bitmaps we don't get bitmaps larger than 16 Kbytes. */
+ if (desc->klen > 2)
+ return false;
+
+ est->size = nft_bitmap_total_size(desc->klen);
+ est->lookup = NFT_SET_CLASS_O_1;
+ est->space = NFT_SET_CLASS_O_1;
+
+ return true;
+}
+
+static struct nft_set_ops nft_bitmap_ops __read_mostly = {
+ .privsize = nft_bitmap_privsize,
+ .elemsize = offsetof(struct nft_bitmap_elem, ext),
+ .estimate = nft_bitmap_estimate,
+ .init = nft_bitmap_init,
+ .destroy = nft_bitmap_destroy,
+ .insert = nft_bitmap_insert,
+ .remove = nft_bitmap_remove,
+ .deactivate = nft_bitmap_deactivate,
+ .flush = nft_bitmap_flush,
+ .activate = nft_bitmap_activate,
+ .lookup = nft_bitmap_lookup,
+ .walk = nft_bitmap_walk,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_bitmap_module_init(void)
+{
+ return nft_register_set(&nft_bitmap_ops);
+}
+
+static void __exit nft_bitmap_module_exit(void)
+{
+ nft_unregister_set(&nft_bitmap_ops);
+}
+
+module_init(nft_bitmap_module_init);
+module_exit(nft_bitmap_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index a3dface3e6e6..5f652720fc78 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -167,6 +167,19 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set,
nft_set_elem_clear_busy(&he->ext);
}
+static bool nft_hash_flush(const struct net *net,
+ const struct nft_set *set, void *priv)
+{
+ struct nft_hash_elem *he = priv;
+
+ if (!nft_set_elem_mark_busy(&he->ext) ||
+ !nft_is_active(net, &he->ext)) {
+ nft_set_elem_change_active(net, set, &he->ext);
+ return true;
+ }
+ return false;
+}
+
static void *nft_hash_deactivate(const struct net *net,
const struct nft_set *set,
const struct nft_set_elem *elem)
@@ -181,19 +194,17 @@ static void *nft_hash_deactivate(const struct net *net,
rcu_read_lock();
he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
- if (he != NULL) {
- if (!nft_set_elem_mark_busy(&he->ext) ||
- !nft_is_active(net, &he->ext))
- nft_set_elem_change_active(net, set, &he->ext);
- else
- he = NULL;
- }
+ if (he != NULL &&
+ !nft_hash_flush(net, set, he))
+ he = NULL;
+
rcu_read_unlock();
return he;
}
-static void nft_hash_remove(const struct nft_set *set,
+static void nft_hash_remove(const struct net *net,
+ const struct nft_set *set,
const struct nft_set_elem *elem)
{
struct nft_hash *priv = nft_set_priv(set);
@@ -202,7 +213,7 @@ static void nft_hash_remove(const struct nft_set *set,
rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params);
}
-static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
+static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_iter *iter)
{
struct nft_hash *priv = nft_set_priv(set);
@@ -373,7 +384,8 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
est->size = esize + 2 * sizeof(struct nft_hash_elem *);
}
- est->class = NFT_SET_CLASS_O_1;
+ est->lookup = NFT_SET_CLASS_O_1;
+ est->space = NFT_SET_CLASS_O_N;
return true;
}
@@ -387,11 +399,12 @@ static struct nft_set_ops nft_hash_ops __read_mostly = {
.insert = nft_hash_insert,
.activate = nft_hash_activate,
.deactivate = nft_hash_deactivate,
+ .flush = nft_hash_flush,
.remove = nft_hash_remove,
.lookup = nft_hash_lookup,
.update = nft_hash_update,
.walk = nft_hash_walk,
- .features = NFT_SET_MAP | NFT_SET_TIMEOUT,
+ .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
.owner = THIS_MODULE,
};
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 36493a7cae88..78dfbf9588b3 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -60,11 +60,10 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
d = memcmp(this, key, set->klen);
if (d < 0) {
parent = parent->rb_left;
- /* In case of adjacent ranges, we always see the high
- * part of the range in first place, before the low one.
- * So don't update interval if the keys are equal.
- */
- if (interval && nft_rbtree_equal(set, this, interval))
+ if (interval &&
+ nft_rbtree_equal(set, this, interval) &&
+ nft_rbtree_interval_end(this) &&
+ !nft_rbtree_interval_end(interval))
continue;
interval = rbe;
} else if (d > 0)
@@ -151,7 +150,8 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
return err;
}
-static void nft_rbtree_remove(const struct nft_set *set,
+static void nft_rbtree_remove(const struct net *net,
+ const struct nft_set *set,
const struct nft_set_elem *elem)
{
struct nft_rbtree *priv = nft_set_priv(set);
@@ -171,6 +171,15 @@ static void nft_rbtree_activate(const struct net *net,
nft_set_elem_change_active(net, set, &rbe->ext);
}
+static bool nft_rbtree_flush(const struct net *net,
+ const struct nft_set *set, void *priv)
+{
+ struct nft_rbtree_elem *rbe = priv;
+
+ nft_set_elem_change_active(net, set, &rbe->ext);
+ return true;
+}
+
static void *nft_rbtree_deactivate(const struct net *net,
const struct nft_set *set,
const struct nft_set_elem *elem)
@@ -204,7 +213,7 @@ static void *nft_rbtree_deactivate(const struct net *net,
parent = parent->rb_right;
continue;
}
- nft_set_elem_change_active(net, set, &rbe->ext);
+ nft_rbtree_flush(net, set, rbe);
return rbe;
}
}
@@ -212,7 +221,7 @@ static void *nft_rbtree_deactivate(const struct net *net,
}
static void nft_rbtree_walk(const struct nft_ctx *ctx,
- const struct nft_set *set,
+ struct nft_set *set,
struct nft_set_iter *iter)
{
const struct nft_rbtree *priv = nft_set_priv(set);
@@ -281,7 +290,8 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
else
est->size = nsize;
- est->class = NFT_SET_CLASS_O_LOG_N;
+ est->lookup = NFT_SET_CLASS_O_LOG_N;
+ est->space = NFT_SET_CLASS_O_N;
return true;
}
@@ -295,10 +305,11 @@ static struct nft_set_ops nft_rbtree_ops __read_mostly = {
.insert = nft_rbtree_insert,
.remove = nft_rbtree_remove,
.deactivate = nft_rbtree_deactivate,
+ .flush = nft_rbtree_flush,
.activate = nft_rbtree_activate,
.lookup = nft_rbtree_lookup,
.walk = nft_rbtree_walk,
- .features = NFT_SET_INTERVAL | NFT_SET_MAP,
+ .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT,
.owner = THIS_MODULE,
};
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index fc4977456c30..14857afc9937 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -40,6 +40,7 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
+#define XT_PCPU_BLOCK_SIZE 4096
struct compat_delta {
unsigned int offset; /* offset in kernel */
@@ -261,6 +262,60 @@ struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision)
}
EXPORT_SYMBOL_GPL(xt_request_find_target);
+
+static int xt_obj_to_user(u16 __user *psize, u16 size,
+ void __user *pname, const char *name,
+ u8 __user *prev, u8 rev)
+{
+ if (put_user(size, psize))
+ return -EFAULT;
+ if (copy_to_user(pname, name, strlen(name) + 1))
+ return -EFAULT;
+ if (put_user(rev, prev))
+ return -EFAULT;
+
+ return 0;
+}
+
+#define XT_OBJ_TO_USER(U, K, TYPE, C_SIZE) \
+ xt_obj_to_user(&U->u.TYPE##_size, C_SIZE ? : K->u.TYPE##_size, \
+ U->u.user.name, K->u.kernel.TYPE->name, \
+ &U->u.user.revision, K->u.kernel.TYPE->revision)
+
+int xt_data_to_user(void __user *dst, const void *src,
+ int usersize, int size)
+{
+ usersize = usersize ? : size;
+ if (copy_to_user(dst, src, usersize))
+ return -EFAULT;
+ if (usersize != size && clear_user(dst + usersize, size - usersize))
+ return -EFAULT;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xt_data_to_user);
+
+#define XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \
+ xt_data_to_user(U->data, K->data, \
+ K->u.kernel.TYPE->usersize, \
+ C_SIZE ? : K->u.kernel.TYPE->TYPE##size)
+
+int xt_match_to_user(const struct xt_entry_match *m,
+ struct xt_entry_match __user *u)
+{
+ return XT_OBJ_TO_USER(u, m, match, 0) ||
+ XT_DATA_TO_USER(u, m, match, 0);
+}
+EXPORT_SYMBOL_GPL(xt_match_to_user);
+
+int xt_target_to_user(const struct xt_entry_target *t,
+ struct xt_entry_target __user *u)
+{
+ return XT_OBJ_TO_USER(u, t, target, 0) ||
+ XT_DATA_TO_USER(u, t, target, 0);
+}
+EXPORT_SYMBOL_GPL(xt_target_to_user);
+
static int match_revfn(u8 af, const char *name, u8 revision, int *bestp)
{
const struct xt_match *m;
@@ -564,17 +619,14 @@ int xt_compat_match_to_user(const struct xt_entry_match *m,
int off = xt_compat_match_offset(match);
u_int16_t msize = m->u.user.match_size - off;
- if (copy_to_user(cm, m, sizeof(*cm)) ||
- put_user(msize, &cm->u.user.match_size) ||
- copy_to_user(cm->u.user.name, m->u.kernel.match->name,
- strlen(m->u.kernel.match->name) + 1))
+ if (XT_OBJ_TO_USER(cm, m, match, msize))
return -EFAULT;
if (match->compat_to_user) {
if (match->compat_to_user((void __user *)cm->data, m->data))
return -EFAULT;
} else {
- if (copy_to_user(cm->data, m->data, msize - sizeof(*cm)))
+ if (XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm)))
return -EFAULT;
}
@@ -615,7 +667,7 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
COMPAT_XT_ALIGN(target_offset + sizeof(struct compat_xt_standard_target)) != next_offset)
return -EINVAL;
- /* compat_xt_entry match has less strict aligment requirements,
+ /* compat_xt_entry match has less strict alignment requirements,
* otherwise they are identical. In case of padding differences
* we need to add compat version of xt_check_entry_match.
*/
@@ -922,17 +974,14 @@ int xt_compat_target_to_user(const struct xt_entry_target *t,
int off = xt_compat_target_offset(target);
u_int16_t tsize = t->u.user.target_size - off;
- if (copy_to_user(ct, t, sizeof(*ct)) ||
- put_user(tsize, &ct->u.user.target_size) ||
- copy_to_user(ct->u.user.name, t->u.kernel.target->name,
- strlen(t->u.kernel.target->name) + 1))
+ if (XT_OBJ_TO_USER(ct, t, target, tsize))
return -EFAULT;
if (target->compat_to_user) {
if (target->compat_to_user((void __user *)ct->data, t->data))
return -EFAULT;
} else {
- if (copy_to_user(ct->data, t->data, tsize - sizeof(*ct)))
+ if (XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct)))
return -EFAULT;
}
@@ -958,7 +1007,9 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
if (!info) {
- info = vmalloc(sz);
+ info = __vmalloc(sz, GFP_KERNEL | __GFP_NOWARN |
+ __GFP_NORETRY | __GFP_HIGHMEM,
+ PAGE_KERNEL);
if (!info)
return NULL;
}
@@ -982,7 +1033,7 @@ void xt_free_table_info(struct xt_table_info *info)
}
EXPORT_SYMBOL(xt_free_table_info);
-/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
+/* Find table by name, grabs mutex & ref. Returns NULL on error. */
struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
const char *name)
{
@@ -1615,6 +1666,59 @@ void xt_proto_fini(struct net *net, u_int8_t af)
}
EXPORT_SYMBOL_GPL(xt_proto_fini);
+/**
+ * xt_percpu_counter_alloc - allocate x_tables rule counter
+ *
+ * @state: pointer to xt_percpu allocation state
+ * @counter: pointer to counter struct inside the ip(6)/arpt_entry struct
+ *
+ * On SMP, the packet counter [ ip(6)t_entry->counters.pcnt ] will then
+ * contain the address of the real (percpu) counter.
+ *
+ * Rule evaluation needs to use xt_get_this_cpu_counter() helper
+ * to fetch the real percpu counter.
+ *
+ * To speed up allocation and improve data locality, a 4kb block is
+ * allocated.
+ *
+ * xt_percpu_counter_alloc_state contains the base address of the
+ * allocated page and the current sub-offset.
+ *
+ * returns false on error.
+ */
+bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state,
+ struct xt_counters *counter)
+{
+ BUILD_BUG_ON(XT_PCPU_BLOCK_SIZE < (sizeof(*counter) * 2));
+
+ if (nr_cpu_ids <= 1)
+ return true;
+
+ if (!state->mem) {
+ state->mem = __alloc_percpu(XT_PCPU_BLOCK_SIZE,
+ XT_PCPU_BLOCK_SIZE);
+ if (!state->mem)
+ return false;
+ }
+ counter->pcnt = (__force unsigned long)(state->mem + state->off);
+ state->off += sizeof(*counter);
+ if (state->off > (XT_PCPU_BLOCK_SIZE - sizeof(*counter))) {
+ state->mem = NULL;
+ state->off = 0;
+ }
+ return true;
+}
+EXPORT_SYMBOL_GPL(xt_percpu_counter_alloc);
+
+void xt_percpu_counter_free(struct xt_counters *counters)
+{
+ unsigned long pcnt = counters->pcnt;
+
+ if (nr_cpu_ids > 1 && (pcnt & (XT_PCPU_BLOCK_SIZE - 1)) == 0)
+ free_percpu((void __percpu *)pcnt);
+}
+EXPORT_SYMBOL_GPL(xt_percpu_counter_free);
+
static int __net_init xt_net_init(struct net *net)
{
int i;
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
index 4973cbddc446..19247a17e511 100644
--- a/net/netfilter/xt_AUDIT.c
+++ b/net/netfilter/xt_AUDIT.c
@@ -132,9 +132,9 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
goto errout;
audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s",
- info->type, par->hooknum, skb->len,
- par->in ? par->in->name : "?",
- par->out ? par->out->name : "?");
+ info->type, xt_hooknum(par), skb->len,
+ xt_in(par) ? xt_inname(par) : "?",
+ xt_out(par) ? xt_outname(par) : "?");
if (skb->mark)
audit_log_format(ab, " mark=%#x", skb->mark);
@@ -144,7 +144,7 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
ntohs(eth_hdr(skb)->h_proto));
- if (par->family == NFPROTO_BRIDGE) {
+ if (xt_family(par) == NFPROTO_BRIDGE) {
switch (eth_hdr(skb)->h_proto) {
case htons(ETH_P_IP):
audit_ip4(ab, skb);
@@ -157,7 +157,7 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
}
}
- switch (par->family) {
+ switch (xt_family(par)) {
case NFPROTO_IPV4:
audit_ip4(ab, skb);
break;
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index e04dc282e3bb..da56c06a443c 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -106,7 +106,7 @@ static int connsecmark_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -115,7 +115,7 @@ static int connsecmark_tg_check(const struct xt_tgchk_param *par)
static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_target connsecmark_tg_reg __read_mostly = {
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 6669e68d589e..b008db0184b8 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -23,15 +23,14 @@
static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
{
/* Previously seen (loopback)? Ignore. */
- if (skb->nfct != NULL)
+ if (skb->_nfct != 0)
return XT_CONTINUE;
/* special case the untracked ct : we want the percpu object */
if (!ct)
ct = nf_ct_untracked_get();
atomic_inc(&ct->ct_general.use);
- skb->nfct = &ct->ct_general;
- skb->nfctinfo = IP_CT_NEW;
+ nf_ct_set(skb, ct, IP_CT_NEW);
return XT_CONTINUE;
}
@@ -216,7 +215,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
goto err1;
#endif
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
goto err1;
@@ -260,7 +259,7 @@ out:
err3:
nf_ct_tmpl_free(ct);
err2:
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
err1:
return ret;
}
@@ -341,7 +340,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par,
if (help)
module_put(help->helper->me);
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
xt_ct_destroy_timeout(ct);
nf_ct_put(info->ct);
@@ -373,6 +372,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.name = "CT",
.family = NFPROTO_UNSPEC,
.targetsize = sizeof(struct xt_ct_target_info),
+ .usersize = offsetof(struct xt_ct_target_info, ct),
.checkentry = xt_ct_tg_check_v0,
.destroy = xt_ct_tg_destroy_v0,
.target = xt_ct_target_v0,
@@ -384,6 +384,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision = 1,
.targetsize = sizeof(struct xt_ct_target_info_v1),
+ .usersize = offsetof(struct xt_ct_target_info, ct),
.checkentry = xt_ct_tg_check_v1,
.destroy = xt_ct_tg_destroy_v1,
.target = xt_ct_target_v1,
@@ -395,6 +396,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision = 2,
.targetsize = sizeof(struct xt_ct_target_info_v1),
+ .usersize = offsetof(struct xt_ct_target_info, ct),
.checkentry = xt_ct_tg_check_v2,
.destroy = xt_ct_tg_destroy_v1,
.target = xt_ct_target_v1,
@@ -407,12 +409,11 @@ static unsigned int
notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
/* Previously seen (loopback)? Ignore. */
- if (skb->nfct != NULL)
+ if (skb->_nfct != 0)
return XT_CONTINUE;
- skb->nfct = &nf_ct_untracked_get()->ct_general;
- skb->nfctinfo = IP_CT_NEW;
- nf_conntrack_get(skb->nfct);
+ nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
+ nf_conntrack_get(skb_nfct(skb));
return XT_CONTINUE;
}
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index 1763ab82bcd7..c3b2017ebe41 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -32,15 +32,15 @@ static unsigned int
log_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_log_info *loginfo = par->targinfo;
+ struct net *net = xt_net(par);
struct nf_loginfo li;
- struct net *net = par->net;
li.type = NF_LOG_TYPE_LOG;
li.u.log.level = loginfo->level;
li.u.log.logflags = loginfo->logflags;
- nf_log_packet(net, par->family, par->hooknum, skb, par->in, par->out,
- &li, "%s", loginfo->prefix);
+ nf_log_packet(net, xt_family(par), xt_hooknum(par), skb, xt_in(par),
+ xt_out(par), &li, "%s", loginfo->prefix);
return XT_CONTINUE;
}
diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c
index b253e07cb1c5..e45a01255e70 100644
--- a/net/netfilter/xt_NETMAP.c
+++ b/net/netfilter/xt_NETMAP.c
@@ -33,8 +33,8 @@ netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
netmask.ip6[i] = ~(range->min_addr.ip6[i] ^
range->max_addr.ip6[i]);
- if (par->hooknum == NF_INET_PRE_ROUTING ||
- par->hooknum == NF_INET_LOCAL_OUT)
+ if (xt_hooknum(par) == NF_INET_PRE_ROUTING ||
+ xt_hooknum(par) == NF_INET_LOCAL_OUT)
new_addr.in6 = ipv6_hdr(skb)->daddr;
else
new_addr.in6 = ipv6_hdr(skb)->saddr;
@@ -51,7 +51,7 @@ netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
newrange.min_proto = range->min_proto;
newrange.max_proto = range->max_proto;
- return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
+ return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(xt_hooknum(par)));
}
static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
@@ -60,7 +60,12 @@ static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
return -EINVAL;
- return 0;
+ return nf_ct_netns_get(par->net, par->family);
+}
+
+static void netmap_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_netns_put(par->net, par->family);
}
static unsigned int
@@ -72,16 +77,16 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par)
const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
struct nf_nat_range newrange;
- NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
- par->hooknum == NF_INET_POST_ROUTING ||
- par->hooknum == NF_INET_LOCAL_OUT ||
- par->hooknum == NF_INET_LOCAL_IN);
+ NF_CT_ASSERT(xt_hooknum(par) == NF_INET_PRE_ROUTING ||
+ xt_hooknum(par) == NF_INET_POST_ROUTING ||
+ xt_hooknum(par) == NF_INET_LOCAL_OUT ||
+ xt_hooknum(par) == NF_INET_LOCAL_IN);
ct = nf_ct_get(skb, &ctinfo);
netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
- if (par->hooknum == NF_INET_PRE_ROUTING ||
- par->hooknum == NF_INET_LOCAL_OUT)
+ if (xt_hooknum(par) == NF_INET_PRE_ROUTING ||
+ xt_hooknum(par) == NF_INET_LOCAL_OUT)
new_ip = ip_hdr(skb)->daddr & ~netmask;
else
new_ip = ip_hdr(skb)->saddr & ~netmask;
@@ -96,7 +101,7 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par)
newrange.max_proto = mr->range[0].max;
/* Hand modified range to generic setup. */
- return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
+ return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(xt_hooknum(par)));
}
static int netmap_tg4_check(const struct xt_tgchk_param *par)
@@ -111,7 +116,7 @@ static int netmap_tg4_check(const struct xt_tgchk_param *par)
pr_debug("bad rangesize %u.\n", mr->rangesize);
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(par->net, par->family);
}
static struct xt_target netmap_tg_reg[] __read_mostly = {
@@ -127,6 +132,7 @@ static struct xt_target netmap_tg_reg[] __read_mostly = {
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_LOCAL_IN),
.checkentry = netmap_tg6_checkentry,
+ .destroy = netmap_tg_destroy,
.me = THIS_MODULE,
},
{
@@ -141,6 +147,7 @@ static struct xt_target netmap_tg_reg[] __read_mostly = {
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_LOCAL_IN),
.checkentry = netmap_tg4_check,
+ .destroy = netmap_tg_destroy,
.me = THIS_MODULE,
},
};
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index 8668a5c18dc3..c7f8958cea4a 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -25,8 +25,8 @@ static unsigned int
nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_nflog_info *info = par->targinfo;
+ struct net *net = xt_net(par);
struct nf_loginfo li;
- struct net *net = par->net;
li.type = NF_LOG_TYPE_ULOG;
li.u.ulog.copy_len = info->len;
@@ -37,8 +37,8 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
if (info->flags & XT_NFLOG_F_COPY_LEN)
li.u.ulog.flags |= NF_LOG_F_COPY_LEN;
- nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in,
- par->out, &li, info->prefix);
+ nfulnl_log_packet(net, xt_family(par), xt_hooknum(par), skb,
+ xt_in(par), xt_out(par), &li, info->prefix);
return XT_CONTINUE;
}
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 8f1779ff7e30..a360b99a958a 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -43,7 +43,7 @@ nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
if (info->queues_total > 1) {
queue = nfqueue_hash(skb, queue, info->queues_total,
- par->family, jhash_initval);
+ xt_family(par), jhash_initval);
}
return NF_QUEUE_NR(queue);
}
@@ -98,7 +98,7 @@ nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par)
queue = info->queuenum + cpu % info->queues_total;
} else {
queue = nfqueue_hash(skb, queue, info->queues_total,
- par->family, jhash_initval);
+ xt_family(par), jhash_initval);
}
}
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index dbd6c4a12b97..498b54fd04d7 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -63,7 +63,7 @@ void xt_rateest_put(struct xt_rateest *est)
mutex_lock(&xt_rateest_mutex);
if (--est->refcnt == 0) {
hlist_del(&est->list);
- gen_kill_estimator(&est->bstats, &est->rstats);
+ gen_kill_estimator(&est->rate_est);
/*
* gen_estimator est_timer() might access est->lock or bstats,
* wait a RCU grace period before freeing 'est'
@@ -132,7 +132,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
cfg.est.interval = info->interval;
cfg.est.ewma_log = info->ewma_log;
- ret = gen_new_estimator(&est->bstats, NULL, &est->rstats,
+ ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est,
&est->lock, NULL, &cfg.opt);
if (ret < 0)
goto err2;
@@ -162,6 +162,7 @@ static struct xt_target xt_rateest_tg_reg __read_mostly = {
.checkentry = xt_rateest_tg_checkentry,
.destroy = xt_rateest_tg_destroy,
.targetsize = sizeof(struct xt_rateest_target_info),
+ .usersize = offsetof(struct xt_rateest_target_info, est),
.me = THIS_MODULE,
};
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
index 03f0b370e178..98a4c6d4f1cb 100644
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -31,7 +31,7 @@
static unsigned int
redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
- return nf_nat_redirect_ipv6(skb, par->targinfo, par->hooknum);
+ return nf_nat_redirect_ipv6(skb, par->targinfo, xt_hooknum(par));
}
static int redirect_tg6_checkentry(const struct xt_tgchk_param *par)
@@ -40,7 +40,13 @@ static int redirect_tg6_checkentry(const struct xt_tgchk_param *par)
if (range->flags & NF_NAT_RANGE_MAP_IPS)
return -EINVAL;
- return 0;
+
+ return nf_ct_netns_get(par->net, par->family);
+}
+
+static void redirect_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_netns_put(par->net, par->family);
}
/* FIXME: Take multiple ranges --RR */
@@ -56,13 +62,13 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par)
pr_debug("bad rangesize %u.\n", mr->rangesize);
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(par->net, par->family);
}
static unsigned int
redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
- return nf_nat_redirect_ipv4(skb, par->targinfo, par->hooknum);
+ return nf_nat_redirect_ipv4(skb, par->targinfo, xt_hooknum(par));
}
static struct xt_target redirect_tg_reg[] __read_mostly = {
@@ -72,6 +78,7 @@ static struct xt_target redirect_tg_reg[] __read_mostly = {
.revision = 0,
.table = "nat",
.checkentry = redirect_tg6_checkentry,
+ .destroy = redirect_tg_destroy,
.target = redirect_tg6,
.targetsize = sizeof(struct nf_nat_range),
.hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -85,6 +92,7 @@ static struct xt_target redirect_tg_reg[] __read_mostly = {
.table = "nat",
.target = redirect_tg4,
.checkentry = redirect_tg4_check,
+ .destroy = redirect_tg_destroy,
.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_OUT),
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 872db2d0e2a9..c64aca611ac5 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -104,11 +104,11 @@ tcpmss_mangle_packet(struct sk_buff *skb,
tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
tcp_hdrlen = tcph->doff * 4;
- if (len < tcp_hdrlen)
+ if (len < tcp_hdrlen || tcp_hdrlen < sizeof(struct tcphdr))
return -1;
if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
- struct net *net = par->net;
+ struct net *net = xt_net(par);
unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family);
unsigned int min_mtu = min(dst_mtu(skb_dst(skb)), in_mtu);
@@ -152,6 +152,10 @@ tcpmss_mangle_packet(struct sk_buff *skb,
if (len > tcp_hdrlen)
return 0;
+ /* tcph->doff has 4 bits, do not wrap it to 0 */
+ if (tcp_hdrlen >= 15 * 4)
+ return 0;
+
/*
* MSS Option not found ?! add it..
*/
@@ -172,7 +176,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
* length IPv6 header of 60, ergo the default MSS value is 1220
* Since no MSS was provided, we must use the default values
*/
- if (par->family == NFPROTO_IPV4)
+ if (xt_family(par) == NFPROTO_IPV4)
newmss = min(newmss, (u16)536);
else
newmss = min(newmss, (u16)1220);
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 0471db4032c5..86b0580b2216 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -33,7 +33,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
const struct xt_tee_tginfo *info = par->targinfo;
int oif = info->priv ? info->priv->oif : 0;
- nf_dup_ipv4(par->net, skb, par->hooknum, &info->gw.in, oif);
+ nf_dup_ipv4(xt_net(par), skb, xt_hooknum(par), &info->gw.in, oif);
return XT_CONTINUE;
}
@@ -45,7 +45,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
const struct xt_tee_tginfo *info = par->targinfo;
int oif = info->priv ? info->priv->oif : 0;
- nf_dup_ipv6(par->net, skb, par->hooknum, &info->gw.in6, oif);
+ nf_dup_ipv6(xt_net(par), skb, xt_hooknum(par), &info->gw.in6, oif);
return XT_CONTINUE;
}
@@ -133,6 +133,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.target = tee_tg4,
.targetsize = sizeof(struct xt_tee_tginfo),
+ .usersize = offsetof(struct xt_tee_tginfo, priv),
.checkentry = tee_tg_check,
.destroy = tee_tg_destroy,
.me = THIS_MODULE,
@@ -144,6 +145,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.target = tee_tg6,
.targetsize = sizeof(struct xt_tee_tginfo),
+ .usersize = offsetof(struct xt_tee_tginfo, priv),
.checkentry = tee_tg_check,
.destroy = tee_tg_destroy,
.me = THIS_MODULE,
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 663c4c3c9072..df7f1df00330 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -364,7 +364,8 @@ tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tproxy_target_info *tgi = par->targinfo;
- return tproxy_tg4(par->net, skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value);
+ return tproxy_tg4(xt_net(par), skb, tgi->laddr, tgi->lport,
+ tgi->mark_mask, tgi->mark_value);
}
static unsigned int
@@ -372,7 +373,8 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
- return tproxy_tg4(par->net, skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value);
+ return tproxy_tg4(xt_net(par), skb, tgi->laddr.ip, tgi->lport,
+ tgi->mark_mask, tgi->mark_value);
}
#ifdef XT_TPROXY_HAVE_IPV6
@@ -391,7 +393,8 @@ tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
rcu_read_lock();
indev = __in6_dev_get(skb->dev);
- if (indev)
+ if (indev) {
+ read_lock_bh(&indev->lock);
list_for_each_entry(ifa, &indev->addr_list, if_list) {
if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
continue;
@@ -399,6 +402,8 @@ tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
laddr = &ifa->addr;
break;
}
+ read_unlock_bh(&indev->lock);
+ }
rcu_read_unlock();
return laddr ? laddr : daddr;
@@ -442,7 +447,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
* to a listener socket if there's one */
struct sock *sk2;
- sk2 = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto,
+ sk2 = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
&iph->saddr,
tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
hp->source,
@@ -485,10 +490,10 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
* addresses, this happens if the redirect already happened
* and the current packet belongs to an already established
* connection */
- sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto,
+ sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
&iph->saddr, &iph->daddr,
hp->source, hp->dest,
- par->in, NFT_LOOKUP_ESTABLISHED);
+ xt_in(par), NFT_LOOKUP_ESTABLISHED);
laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
lport = tgi->lport ? tgi->lport : hp->dest;
@@ -500,10 +505,10 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
else if (!sk)
/* no there's no established connection, check if
* there's a listener on the redirected addr/port */
- sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp,
+ sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp,
tproto, &iph->saddr, laddr,
hp->source, lport,
- par->in, NFT_LOOKUP_LISTENER);
+ xt_in(par), NFT_LOOKUP_LISTENER);
/* NOTE: assign_sock consumes our sk reference */
if (sk && tproxy_sk_is_transparent(sk)) {
@@ -529,6 +534,11 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
static int tproxy_tg6_check(const struct xt_tgchk_param *par)
{
const struct ip6t_ip6 *i = par->entryinfo;
+ int err;
+
+ err = nf_defrag_ipv6_enable(par->net);
+ if (err)
+ return err;
if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) &&
!(i->invflags & IP6T_INV_PROTO))
@@ -543,6 +553,11 @@ static int tproxy_tg6_check(const struct xt_tgchk_param *par)
static int tproxy_tg4_check(const struct xt_tgchk_param *par)
{
const struct ipt_ip *i = par->entryinfo;
+ int err;
+
+ err = nf_defrag_ipv4_enable(par->net);
+ if (err)
+ return err;
if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP)
&& !(i->invflags & IPT_INV_PROTO))
@@ -594,11 +609,6 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = {
static int __init tproxy_tg_init(void)
{
- nf_defrag_ipv4_enable();
-#ifdef XT_TPROXY_HAVE_IPV6
- nf_defrag_ipv6_enable();
-#endif
-
return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
}
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
index 11d6091991a4..e329dabde35f 100644
--- a/net/netfilter/xt_addrtype.c
+++ b/net/netfilter/xt_addrtype.c
@@ -125,7 +125,7 @@ static inline bool match_type(struct net *net, const struct net_device *dev,
static bool
addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct net *net = par->net;
+ struct net *net = xt_net(par);
const struct xt_addrtype_info *info = par->matchinfo;
const struct iphdr *iph = ip_hdr(skb);
bool ret = true;
@@ -143,19 +143,19 @@ addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
static bool
addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct net *net = par->net;
+ struct net *net = xt_net(par);
const struct xt_addrtype_info_v1 *info = par->matchinfo;
const struct iphdr *iph;
const struct net_device *dev = NULL;
bool ret = true;
if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN)
- dev = par->in;
+ dev = xt_in(par);
else if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT)
- dev = par->out;
+ dev = xt_out(par);
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
- if (par->family == NFPROTO_IPV6)
+ if (xt_family(par) == NFPROTO_IPV6)
return addrtype_mt6(net, dev, skb, info);
#endif
iph = ip_hdr(skb);
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index dffee9d47ec4..38986a95216c 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/filter.h>
+#include <linux/bpf.h>
#include <linux/netfilter/xt_bpf.h>
#include <linux/netfilter/x_tables.h>
@@ -20,15 +21,15 @@ MODULE_LICENSE("GPL");
MODULE_ALIAS("ipt_bpf");
MODULE_ALIAS("ip6t_bpf");
-static int bpf_mt_check(const struct xt_mtchk_param *par)
+static int __bpf_mt_check_bytecode(struct sock_filter *insns, __u16 len,
+ struct bpf_prog **ret)
{
- struct xt_bpf_info *info = par->matchinfo;
struct sock_fprog_kern program;
- program.len = info->bpf_program_num_elem;
- program.filter = info->bpf_program;
+ program.len = len;
+ program.filter = insns;
- if (bpf_prog_create(&info->filter, &program)) {
+ if (bpf_prog_create(ret, &program)) {
pr_info("bpf: check failed: parse error\n");
return -EINVAL;
}
@@ -36,6 +37,42 @@ static int bpf_mt_check(const struct xt_mtchk_param *par)
return 0;
}
+static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret)
+{
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ *ret = prog;
+ return 0;
+}
+
+static int bpf_mt_check(const struct xt_mtchk_param *par)
+{
+ struct xt_bpf_info *info = par->matchinfo;
+
+ return __bpf_mt_check_bytecode(info->bpf_program,
+ info->bpf_program_num_elem,
+ &info->filter);
+}
+
+static int bpf_mt_check_v1(const struct xt_mtchk_param *par)
+{
+ struct xt_bpf_info_v1 *info = par->matchinfo;
+
+ if (info->mode == XT_BPF_MODE_BYTECODE)
+ return __bpf_mt_check_bytecode(info->bpf_program,
+ info->bpf_program_num_elem,
+ &info->filter);
+ else if (info->mode == XT_BPF_MODE_FD_PINNED ||
+ info->mode == XT_BPF_MODE_FD_ELF)
+ return __bpf_mt_check_fd(info->fd, &info->filter);
+ else
+ return -EINVAL;
+}
+
static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_bpf_info *info = par->matchinfo;
@@ -43,31 +80,60 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
return BPF_PROG_RUN(info->filter, skb);
}
+static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_bpf_info_v1 *info = par->matchinfo;
+
+ return !!bpf_prog_run_save_cb(info->filter, (struct sk_buff *) skb);
+}
+
static void bpf_mt_destroy(const struct xt_mtdtor_param *par)
{
const struct xt_bpf_info *info = par->matchinfo;
+
+ bpf_prog_destroy(info->filter);
+}
+
+static void bpf_mt_destroy_v1(const struct xt_mtdtor_param *par)
+{
+ const struct xt_bpf_info_v1 *info = par->matchinfo;
+
bpf_prog_destroy(info->filter);
}
-static struct xt_match bpf_mt_reg __read_mostly = {
- .name = "bpf",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = bpf_mt_check,
- .match = bpf_mt,
- .destroy = bpf_mt_destroy,
- .matchsize = sizeof(struct xt_bpf_info),
- .me = THIS_MODULE,
+static struct xt_match bpf_mt_reg[] __read_mostly = {
+ {
+ .name = "bpf",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = bpf_mt_check,
+ .match = bpf_mt,
+ .destroy = bpf_mt_destroy,
+ .matchsize = sizeof(struct xt_bpf_info),
+ .usersize = offsetof(struct xt_bpf_info, filter),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "bpf",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = bpf_mt_check_v1,
+ .match = bpf_mt_v1,
+ .destroy = bpf_mt_destroy_v1,
+ .matchsize = sizeof(struct xt_bpf_info_v1),
+ .usersize = offsetof(struct xt_bpf_info_v1, filter),
+ .me = THIS_MODULE,
+ },
};
static int __init bpf_mt_init(void)
{
- return xt_register_match(&bpf_mt_reg);
+ return xt_register_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg));
}
static void __exit bpf_mt_exit(void)
{
- xt_unregister_match(&bpf_mt_reg);
+ xt_unregister_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg));
}
module_init(bpf_mt_init);
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index a086a914865f..1db1ce59079f 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -122,6 +122,7 @@ static struct xt_match cgroup_mt_reg[] __read_mostly = {
.checkentry = cgroup_mt_check_v1,
.match = cgroup_mt_v1,
.matchsize = sizeof(struct xt_cgroup_info_v1),
+ .usersize = offsetof(struct xt_cgroup_info_v1, priv),
.destroy = cgroup_mt_destroy_v1,
.me = THIS_MODULE,
.hooks = (1 << NF_INET_LOCAL_OUT) |
diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
index 96fa26b20b67..9a9884a39c0e 100644
--- a/net/netfilter/xt_cluster.c
+++ b/net/netfilter/xt_cluster.c
@@ -112,7 +112,7 @@ xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par)
* know, matches should not alter packets, but we are doing this here
* because we would need to add a PKTTYPE target for this sole purpose.
*/
- if (!xt_cluster_is_multicast_addr(skb, par->family) &&
+ if (!xt_cluster_is_multicast_addr(skb, xt_family(par)) &&
skb->pkt_type == PACKET_MULTICAST) {
pskb->pkt_type = PACKET_HOST;
}
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index d4bec261e74e..cad0b7b5eb35 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -110,7 +110,7 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par)
sinfo->direction != XT_CONNBYTES_DIR_BOTH)
return -EINVAL;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -129,7 +129,7 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par)
static void connbytes_mt_destroy(const struct xt_mtdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_match connbytes_mt_reg __read_mostly = {
diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c
index 03d66f1c5e69..7827128d5a95 100644
--- a/net/netfilter/xt_connlabel.c
+++ b/net/netfilter/xt_connlabel.c
@@ -61,7 +61,7 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
return -EINVAL;
}
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0) {
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -70,14 +70,14 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
ret = nf_connlabels_get(par->net, info->bit);
if (ret < 0)
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
return ret;
}
static void connlabel_mt_destroy(const struct xt_mtdtor_param *par)
{
nf_connlabels_put(par->net);
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_match connlabels_mt_reg __read_mostly = {
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index b6dc322593a3..b8fd4ab762ed 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -218,7 +218,7 @@ count_tree(struct net *net, struct rb_root *root,
int diff;
bool addit;
- rbconn = container_of(*rbnode, struct xt_connlimit_rb, node);
+ rbconn = rb_entry(*rbnode, struct xt_connlimit_rb, node);
parent = *rbnode;
diff = same_source_net(addr, mask, &rbconn->addr, family);
@@ -317,7 +317,7 @@ static int count_them(struct net *net,
static bool
connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct net *net = par->net;
+ struct net *net = xt_net(par);
const struct xt_connlimit_info *info = par->matchinfo;
union nf_inet_addr addr;
struct nf_conntrack_tuple tuple;
@@ -332,11 +332,11 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
zone = nf_ct_zone(ct);
} else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
- par->family, net, &tuple)) {
+ xt_family(par), net, &tuple)) {
goto hotdrop;
}
- if (par->family == NFPROTO_IPV6) {
+ if (xt_family(par) == NFPROTO_IPV6) {
const struct ipv6hdr *iph = ipv6_hdr(skb);
memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ?
&iph->daddr : &iph->saddr, sizeof(addr.ip6));
@@ -347,7 +347,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
}
connections = count_them(net, info->data, tuple_ptr, &addr,
- &info->mask, par->family, zone);
+ &info->mask, xt_family(par), zone);
if (connections == 0)
/* kmalloc failed, drop it entirely */
goto hotdrop;
@@ -368,7 +368,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
net_get_random_once(&connlimit_rnd, sizeof(connlimit_rnd));
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0) {
pr_info("cannot load conntrack support for "
"address family %u\n", par->family);
@@ -378,7 +378,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
/* init private data */
info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL);
if (info->data == NULL) {
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
return -ENOMEM;
}
@@ -398,7 +398,7 @@ static void destroy_tree(struct rb_root *r)
struct rb_node *node;
while ((node = rb_first(r)) != NULL) {
- rbconn = container_of(node, struct xt_connlimit_rb, node);
+ rbconn = rb_entry(node, struct xt_connlimit_rb, node);
rb_erase(node, r);
@@ -414,7 +414,7 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
const struct xt_connlimit_info *info = par->matchinfo;
unsigned int i;
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
destroy_tree(&info->data->climit_root4[i]);
@@ -431,6 +431,7 @@ static struct xt_match connlimit_mt_reg __read_mostly = {
.checkentry = connlimit_mt_check,
.match = connlimit_mt,
.matchsize = sizeof(struct xt_connlimit_info),
+ .usersize = offsetof(struct xt_connlimit_info, data),
.destroy = connlimit_mt_destroy,
.me = THIS_MODULE,
};
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index b83e158e116a..9935d5029b0e 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -77,7 +77,7 @@ static int connmark_tg_check(const struct xt_tgchk_param *par)
{
int ret;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -86,7 +86,7 @@ static int connmark_tg_check(const struct xt_tgchk_param *par)
static void connmark_tg_destroy(const struct xt_tgdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static bool
@@ -107,7 +107,7 @@ static int connmark_mt_check(const struct xt_mtchk_param *par)
{
int ret;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -116,7 +116,7 @@ static int connmark_mt_check(const struct xt_mtchk_param *par)
static void connmark_mt_destroy(const struct xt_mtdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_target connmark_tg_reg __read_mostly = {
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index a3b8f697cfc5..c0fb217bc649 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -200,22 +200,22 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
return false;
if (info->match_flags & XT_CONNTRACK_ORIGSRC)
- if (conntrack_mt_origsrc(ct, info, par->family) ^
+ if (conntrack_mt_origsrc(ct, info, xt_family(par)) ^
!(info->invert_flags & XT_CONNTRACK_ORIGSRC))
return false;
if (info->match_flags & XT_CONNTRACK_ORIGDST)
- if (conntrack_mt_origdst(ct, info, par->family) ^
+ if (conntrack_mt_origdst(ct, info, xt_family(par)) ^
!(info->invert_flags & XT_CONNTRACK_ORIGDST))
return false;
if (info->match_flags & XT_CONNTRACK_REPLSRC)
- if (conntrack_mt_replsrc(ct, info, par->family) ^
+ if (conntrack_mt_replsrc(ct, info, xt_family(par)) ^
!(info->invert_flags & XT_CONNTRACK_REPLSRC))
return false;
if (info->match_flags & XT_CONNTRACK_REPLDST)
- if (conntrack_mt_repldst(ct, info, par->family) ^
+ if (conntrack_mt_repldst(ct, info, xt_family(par)) ^
!(info->invert_flags & XT_CONNTRACK_REPLDST))
return false;
@@ -271,7 +271,7 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par)
{
int ret;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -280,7 +280,7 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par)
static void conntrack_mt_destroy(const struct xt_mtdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_match conntrack_mt_reg[] __read_mostly = {
diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c
index d9202cdd25c9..96ebe1cdefec 100644
--- a/net/netfilter/xt_devgroup.c
+++ b/net/netfilter/xt_devgroup.c
@@ -24,12 +24,12 @@ static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
const struct xt_devgroup_info *info = par->matchinfo;
if (info->flags & XT_DEVGROUP_MATCH_SRC &&
- (((info->src_group ^ par->in->group) & info->src_mask ? 1 : 0) ^
+ (((info->src_group ^ xt_in(par)->group) & info->src_mask ? 1 : 0) ^
((info->flags & XT_DEVGROUP_INVERT_SRC) ? 1 : 0)))
return false;
if (info->flags & XT_DEVGROUP_MATCH_DST &&
- (((info->dst_group ^ par->out->group) & info->dst_mask ? 1 : 0) ^
+ (((info->dst_group ^ xt_out(par)->group) & info->dst_mask ? 1 : 0) ^
((info->flags & XT_DEVGROUP_INVERT_DST) ? 1 : 0)))
return false;
diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c
index 64670fc5d0e1..236ac8008909 100644
--- a/net/netfilter/xt_dscp.c
+++ b/net/netfilter/xt_dscp.c
@@ -58,7 +58,7 @@ static bool tos_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_tos_match_info *info = par->matchinfo;
- if (par->family == NFPROTO_IPV4)
+ if (xt_family(par) == NFPROTO_IPV4)
return ((ip_hdr(skb)->tos & info->tos_mask) ==
info->tos_value) ^ !!info->invert;
else
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index b89b688e9d01..2a6dfe8b74d3 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -49,7 +49,7 @@ struct hashlimit_net {
struct proc_dir_entry *ip6t_hashlimit;
};
-static int hashlimit_net_id;
+static unsigned int hashlimit_net_id;
static inline struct hashlimit_net *hashlimit_pernet(struct net *net)
{
return net_generic(net, hashlimit_net_id);
@@ -463,23 +463,16 @@ static u32 xt_hashlimit_len_to_chunks(u32 len)
/* Precision saver. */
static u64 user2credits(u64 user, int revision)
{
- if (revision == 1) {
- /* If multiplying would overflow... */
- if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1))
- /* Divide first. */
- return div64_u64(user, XT_HASHLIMIT_SCALE)
- * HZ * CREDITS_PER_JIFFY_v1;
-
- return div64_u64(user * HZ * CREDITS_PER_JIFFY_v1,
- XT_HASHLIMIT_SCALE);
- } else {
- if (user > 0xFFFFFFFFFFFFFFFFULL / (HZ*CREDITS_PER_JIFFY))
- return div64_u64(user, XT_HASHLIMIT_SCALE_v2)
- * HZ * CREDITS_PER_JIFFY;
+ u64 scale = (revision == 1) ?
+ XT_HASHLIMIT_SCALE : XT_HASHLIMIT_SCALE_v2;
+ u64 cpj = (revision == 1) ?
+ CREDITS_PER_JIFFY_v1 : CREDITS_PER_JIFFY;
- return div64_u64(user * HZ * CREDITS_PER_JIFFY,
- XT_HASHLIMIT_SCALE_v2);
- }
+ /* Avoid overflow: divide the constant operands first */
+ if (scale >= HZ * cpj)
+ return div64_u64(user, div64_u64(scale, HZ * cpj));
+
+ return user * div64_u64(HZ * cpj, scale);
}
static u32 user2credits_byte(u32 user)
@@ -838,6 +831,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.match = hashlimit_mt_v1,
.matchsize = sizeof(struct xt_hashlimit_mtinfo1),
+ .usersize = offsetof(struct xt_hashlimit_mtinfo1, hinfo),
.checkentry = hashlimit_mt_check_v1,
.destroy = hashlimit_mt_destroy_v1,
.me = THIS_MODULE,
@@ -848,6 +842,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.match = hashlimit_mt,
.matchsize = sizeof(struct xt_hashlimit_mtinfo2),
+ .usersize = offsetof(struct xt_hashlimit_mtinfo2, hinfo),
.checkentry = hashlimit_mt_check,
.destroy = hashlimit_mt_destroy,
.me = THIS_MODULE,
@@ -859,6 +854,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.match = hashlimit_mt_v1,
.matchsize = sizeof(struct xt_hashlimit_mtinfo1),
+ .usersize = offsetof(struct xt_hashlimit_mtinfo1, hinfo),
.checkentry = hashlimit_mt_check_v1,
.destroy = hashlimit_mt_destroy_v1,
.me = THIS_MODULE,
@@ -869,6 +865,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.match = hashlimit_mt,
.matchsize = sizeof(struct xt_hashlimit_mtinfo2),
+ .usersize = offsetof(struct xt_hashlimit_mtinfo2, hinfo),
.checkentry = hashlimit_mt_check,
.destroy = hashlimit_mt_destroy,
.me = THIS_MODULE,
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index f679dd4c272a..38a78151c0e9 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -59,7 +59,7 @@ static int helper_mt_check(const struct xt_mtchk_param *par)
struct xt_helper_info *info = par->matchinfo;
int ret;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0) {
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -71,7 +71,7 @@ static int helper_mt_check(const struct xt_mtchk_param *par)
static void helper_mt_destroy(const struct xt_mtdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_match helper_mt_reg __read_mostly = {
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
index 71a9d95e0a81..0fdc89064488 100644
--- a/net/netfilter/xt_ipvs.c
+++ b/net/netfilter/xt_ipvs.c
@@ -48,9 +48,9 @@ static bool
ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_ipvs_mtinfo *data = par->matchinfo;
- struct netns_ipvs *ipvs = net_ipvs(par->net);
+ struct netns_ipvs *ipvs = net_ipvs(xt_net(par));
/* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */
- const u_int8_t family = par->family;
+ const u_int8_t family = xt_family(par);
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_conn *cp;
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index bef850596558..dab962df1787 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -192,6 +192,8 @@ static struct xt_match limit_mt_reg __read_mostly = {
.compatsize = sizeof(struct compat_xt_rateinfo),
.compat_from_user = limit_mt_compat_from_user,
.compat_to_user = limit_mt_compat_to_user,
+#else
+ .usersize = offsetof(struct xt_rateinfo, prev),
#endif
.me = THIS_MODULE,
};
diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
index ac1d3c3d09e7..1cde0e4985b7 100644
--- a/net/netfilter/xt_multiport.c
+++ b/net/netfilter/xt_multiport.c
@@ -42,29 +42,43 @@ ports_match_v1(const struct xt_multiport_v1 *minfo,
e = minfo->ports[++i];
pr_debug("src or dst matches with %d-%d?\n", s, e);
- if (minfo->flags == XT_MULTIPORT_SOURCE
- && src >= s && src <= e)
- return true ^ minfo->invert;
- if (minfo->flags == XT_MULTIPORT_DESTINATION
- && dst >= s && dst <= e)
- return true ^ minfo->invert;
- if (minfo->flags == XT_MULTIPORT_EITHER
- && ((dst >= s && dst <= e)
- || (src >= s && src <= e)))
- return true ^ minfo->invert;
+ switch (minfo->flags) {
+ case XT_MULTIPORT_SOURCE:
+ if (src >= s && src <= e)
+ return true ^ minfo->invert;
+ break;
+ case XT_MULTIPORT_DESTINATION:
+ if (dst >= s && dst <= e)
+ return true ^ minfo->invert;
+ break;
+ case XT_MULTIPORT_EITHER:
+ if ((dst >= s && dst <= e) ||
+ (src >= s && src <= e))
+ return true ^ minfo->invert;
+ break;
+ default:
+ break;
+ }
} else {
/* exact port matching */
pr_debug("src or dst matches with %d?\n", s);
- if (minfo->flags == XT_MULTIPORT_SOURCE
- && src == s)
- return true ^ minfo->invert;
- if (minfo->flags == XT_MULTIPORT_DESTINATION
- && dst == s)
- return true ^ minfo->invert;
- if (minfo->flags == XT_MULTIPORT_EITHER
- && (src == s || dst == s))
- return true ^ minfo->invert;
+ switch (minfo->flags) {
+ case XT_MULTIPORT_SOURCE:
+ if (src == s)
+ return true ^ minfo->invert;
+ break;
+ case XT_MULTIPORT_DESTINATION:
+ if (dst == s)
+ return true ^ minfo->invert;
+ break;
+ case XT_MULTIPORT_EITHER:
+ if (src == s || dst == s)
+ return true ^ minfo->invert;
+ break;
+ default:
+ break;
+ }
}
}
diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c
index bea7464cc43f..8107b3eb865f 100644
--- a/net/netfilter/xt_nat.c
+++ b/net/netfilter/xt_nat.c
@@ -23,7 +23,17 @@ static int xt_nat_checkentry_v0(const struct xt_tgchk_param *par)
par->target->name);
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(par->net, par->family);
+}
+
+static int xt_nat_checkentry(const struct xt_tgchk_param *par)
+{
+ return nf_ct_netns_get(par->net, par->family);
+}
+
+static void xt_nat_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_netns_put(par->net, par->family);
}
static void xt_nat_convert_range(struct nf_nat_range *dst,
@@ -106,6 +116,7 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
.name = "SNAT",
.revision = 0,
.checkentry = xt_nat_checkentry_v0,
+ .destroy = xt_nat_destroy,
.target = xt_snat_target_v0,
.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
.family = NFPROTO_IPV4,
@@ -118,6 +129,7 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
.name = "DNAT",
.revision = 0,
.checkentry = xt_nat_checkentry_v0,
+ .destroy = xt_nat_destroy,
.target = xt_dnat_target_v0,
.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
.family = NFPROTO_IPV4,
@@ -129,6 +141,8 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
{
.name = "SNAT",
.revision = 1,
+ .checkentry = xt_nat_checkentry,
+ .destroy = xt_nat_destroy,
.target = xt_snat_target_v1,
.targetsize = sizeof(struct nf_nat_range),
.table = "nat",
@@ -139,6 +153,8 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
{
.name = "DNAT",
.revision = 1,
+ .checkentry = xt_nat_checkentry,
+ .destroy = xt_nat_destroy,
.target = xt_dnat_target_v1,
.targetsize = sizeof(struct nf_nat_range),
.table = "nat",
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index cf327593852a..cc0518fe598e 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -26,7 +26,7 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par)
nfnl_acct_update(skb, info->nfacct);
- overquota = nfnl_acct_overquota(par->net, skb, info->nfacct);
+ overquota = nfnl_acct_overquota(xt_net(par), skb, info->nfacct);
return overquota == NFACCT_UNDERQUOTA ? false : true;
}
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index 2455b69b5810..c05fefcec238 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -201,7 +201,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
unsigned char opts[MAX_IPOPTLEN];
const struct xt_osf_finger *kf;
const struct xt_osf_user_finger *f;
- struct net *net = p->net;
+ struct net *net = xt_net(p);
if (!info)
return false;
@@ -326,8 +326,8 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
fcount++;
if (info->flags & XT_OSF_LOG)
- nf_log_packet(net, p->family, p->hooknum, skb,
- p->in, p->out, NULL,
+ nf_log_packet(net, xt_family(p), xt_hooknum(p), skb,
+ xt_in(p), xt_out(p), NULL,
"%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
f->genre, f->version, f->subtype,
&ip->saddr, ntohs(tcp->source),
@@ -341,8 +341,8 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
rcu_read_unlock();
if (!fcount && (info->flags & XT_OSF_LOG))
- nf_log_packet(net, p->family, p->hooknum, skb, p->in,
- p->out, NULL,
+ nf_log_packet(net, xt_family(p), xt_hooknum(p), skb, xt_in(p),
+ xt_out(p), NULL,
"Remote OS is not known: %pI4:%u -> %pI4:%u\n",
&ip->saddr, ntohs(tcp->source),
&ip->daddr, ntohs(tcp->dest));
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index a20e731b5b6c..3d705c688a27 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -13,6 +13,8 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/file.h>
+#include <linux/cred.h>
+
#include <net/sock.h>
#include <net/inet_sock.h>
#include <linux/netfilter/x_tables.h>
@@ -63,7 +65,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
const struct xt_owner_match_info *info = par->matchinfo;
const struct file *filp;
struct sock *sk = skb_to_full_sk(skb);
- struct net *net = par->net;
+ struct net *net = xt_net(par);
if (sk == NULL || sk->sk_socket == NULL)
return (info->match ^ info->invert) == 0;
diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c
index 5b645cb598fc..1ef99151b3ba 100644
--- a/net/netfilter/xt_pkttype.c
+++ b/net/netfilter/xt_pkttype.c
@@ -30,11 +30,10 @@ pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (skb->pkt_type != PACKET_LOOPBACK)
type = skb->pkt_type;
- else if (par->family == NFPROTO_IPV4 &&
+ else if (xt_family(par) == NFPROTO_IPV4 &&
ipv4_is_multicast(ip_hdr(skb)->daddr))
type = PACKET_MULTICAST;
- else if (par->family == NFPROTO_IPV6 &&
- ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF)
+ else if (xt_family(par) == NFPROTO_IPV6)
type = PACKET_MULTICAST;
else
type = PACKET_BROADCAST;
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index f23e97bb42d7..2b4ab189bba7 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -116,9 +116,9 @@ policy_mt(const struct sk_buff *skb, struct xt_action_param *par)
int ret;
if (info->flags & XT_POLICY_MATCH_IN)
- ret = match_policy_in(skb, info, par->family);
+ ret = match_policy_in(skb, info, xt_family(par));
else
- ret = match_policy_out(skb, info, par->family);
+ ret = match_policy_out(skb, info, xt_family(par));
if (ret < 0)
ret = info->flags & XT_POLICY_MATCH_NONE ? true : false;
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index 44c8eb4c9d66..10d61a6eed71 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -73,6 +73,7 @@ static struct xt_match quota_mt_reg __read_mostly = {
.checkentry = quota_mt_check,
.destroy = quota_mt_destroy,
.matchsize = sizeof(struct xt_quota_info),
+ .usersize = offsetof(struct xt_quota_info, master),
.me = THIS_MODULE,
};
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index 7720b036d76a..755d2f6693a2 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -18,35 +18,33 @@ static bool
xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_rateest_match_info *info = par->matchinfo;
- struct gnet_stats_rate_est64 *r;
+ struct gnet_stats_rate_est64 sample = {0};
u_int32_t bps1, bps2, pps1, pps2;
bool ret = true;
- spin_lock_bh(&info->est1->lock);
- r = &info->est1->rstats;
+ gen_estimator_read(&info->est1->rate_est, &sample);
+
if (info->flags & XT_RATEEST_MATCH_DELTA) {
- bps1 = info->bps1 >= r->bps ? info->bps1 - r->bps : 0;
- pps1 = info->pps1 >= r->pps ? info->pps1 - r->pps : 0;
+ bps1 = info->bps1 >= sample.bps ? info->bps1 - sample.bps : 0;
+ pps1 = info->pps1 >= sample.pps ? info->pps1 - sample.pps : 0;
} else {
- bps1 = r->bps;
- pps1 = r->pps;
+ bps1 = sample.bps;
+ pps1 = sample.pps;
}
- spin_unlock_bh(&info->est1->lock);
if (info->flags & XT_RATEEST_MATCH_ABS) {
bps2 = info->bps2;
pps2 = info->pps2;
} else {
- spin_lock_bh(&info->est2->lock);
- r = &info->est2->rstats;
+ gen_estimator_read(&info->est2->rate_est, &sample);
+
if (info->flags & XT_RATEEST_MATCH_DELTA) {
- bps2 = info->bps2 >= r->bps ? info->bps2 - r->bps : 0;
- pps2 = info->pps2 >= r->pps ? info->pps2 - r->pps : 0;
+ bps2 = info->bps2 >= sample.bps ? info->bps2 - sample.bps : 0;
+ pps2 = info->pps2 >= sample.pps ? info->pps2 - sample.pps : 0;
} else {
- bps2 = r->bps;
- pps2 = r->pps;
+ bps2 = sample.bps;
+ pps2 = sample.pps;
}
- spin_unlock_bh(&info->est2->lock);
}
switch (info->mode) {
@@ -135,6 +133,7 @@ static struct xt_match xt_rateest_mt_reg __read_mostly = {
.checkentry = xt_rateest_mt_checkentry,
.destroy = xt_rateest_mt_destroy,
.matchsize = sizeof(struct xt_rateest_match_info),
+ .usersize = offsetof(struct xt_rateest_match_info, est1),
.me = THIS_MODULE,
};
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index e3b7a09b103e..1d89a4eaf841 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -95,7 +95,7 @@ struct recent_net {
#endif
};
-static int recent_net_id __read_mostly;
+static unsigned int recent_net_id __read_mostly;
static inline struct recent_net *recent_pernet(struct net *net)
{
@@ -236,7 +236,7 @@ static void recent_table_flush(struct recent_table *t)
static bool
recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct net *net = par->net;
+ struct net *net = xt_net(par);
struct recent_net *recent_net = recent_pernet(net);
const struct xt_recent_mtinfo_v1 *info = par->matchinfo;
struct recent_table *t;
@@ -245,7 +245,7 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
u_int8_t ttl;
bool ret = info->invert;
- if (par->family == NFPROTO_IPV4) {
+ if (xt_family(par) == NFPROTO_IPV4) {
const struct iphdr *iph = ip_hdr(skb);
if (info->side == XT_RECENT_DEST)
@@ -266,7 +266,7 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
}
/* use TTL as seen before forwarding */
- if (par->out != NULL && skb->sk == NULL)
+ if (xt_out(par) != NULL && skb->sk == NULL)
ttl++;
spin_lock_bh(&recent_lock);
@@ -274,12 +274,12 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
nf_inet_addr_mask(&addr, &addr_mask, &t->mask);
- e = recent_entry_lookup(t, &addr_mask, par->family,
+ e = recent_entry_lookup(t, &addr_mask, xt_family(par),
(info->check_set & XT_RECENT_TTL) ? ttl : 0);
if (e == NULL) {
if (!(info->check_set & XT_RECENT_SET))
goto out;
- e = recent_entry_init(t, &addr_mask, par->family, ttl);
+ e = recent_entry_init(t, &addr_mask, xt_family(par), ttl);
if (e == NULL)
par->hotdrop = true;
ret = !ret;
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index 5669e5b453f4..64285702afd5 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -55,7 +55,7 @@ set_match_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_set_info_match_v0 *info = par->matchinfo;
- ADT_OPT(opt, par->family, info->match_set.u.compat.dim,
+ ADT_OPT(opt, xt_family(par), info->match_set.u.compat.dim,
info->match_set.u.compat.flags, 0, UINT_MAX);
return match_set(info->match_set.index, skb, par, &opt,
@@ -118,7 +118,7 @@ set_match_v1(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_set_info_match_v1 *info = par->matchinfo;
- ADT_OPT(opt, par->family, info->match_set.dim,
+ ADT_OPT(opt, xt_family(par), info->match_set.dim,
info->match_set.flags, 0, UINT_MAX);
if (opt.flags & IPSET_RETURN_NOMATCH)
@@ -184,7 +184,7 @@ set_match_v3(const struct sk_buff *skb, struct xt_action_param *par)
const struct xt_set_info_match_v3 *info = par->matchinfo;
int ret;
- ADT_OPT(opt, par->family, info->match_set.dim,
+ ADT_OPT(opt, xt_family(par), info->match_set.dim,
info->match_set.flags, info->flags, UINT_MAX);
if (info->packets.op != IPSET_COUNTER_NONE ||
@@ -231,7 +231,7 @@ set_match_v4(const struct sk_buff *skb, struct xt_action_param *par)
const struct xt_set_info_match_v4 *info = par->matchinfo;
int ret;
- ADT_OPT(opt, par->family, info->match_set.dim,
+ ADT_OPT(opt, xt_family(par), info->match_set.dim,
info->match_set.flags, info->flags, UINT_MAX);
if (info->packets.op != IPSET_COUNTER_NONE ||
@@ -259,9 +259,9 @@ set_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_set_info_target_v0 *info = par->targinfo;
- ADT_OPT(add_opt, par->family, info->add_set.u.compat.dim,
+ ADT_OPT(add_opt, xt_family(par), info->add_set.u.compat.dim,
info->add_set.u.compat.flags, 0, UINT_MAX);
- ADT_OPT(del_opt, par->family, info->del_set.u.compat.dim,
+ ADT_OPT(del_opt, xt_family(par), info->del_set.u.compat.dim,
info->del_set.u.compat.flags, 0, UINT_MAX);
if (info->add_set.index != IPSET_INVALID_ID)
@@ -332,9 +332,9 @@ set_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_set_info_target_v1 *info = par->targinfo;
- ADT_OPT(add_opt, par->family, info->add_set.dim,
+ ADT_OPT(add_opt, xt_family(par), info->add_set.dim,
info->add_set.flags, 0, UINT_MAX);
- ADT_OPT(del_opt, par->family, info->del_set.dim,
+ ADT_OPT(del_opt, xt_family(par), info->del_set.dim,
info->del_set.flags, 0, UINT_MAX);
if (info->add_set.index != IPSET_INVALID_ID)
@@ -401,9 +401,9 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_set_info_target_v2 *info = par->targinfo;
- ADT_OPT(add_opt, par->family, info->add_set.dim,
+ ADT_OPT(add_opt, xt_family(par), info->add_set.dim,
info->add_set.flags, info->flags, info->timeout);
- ADT_OPT(del_opt, par->family, info->del_set.dim,
+ ADT_OPT(del_opt, xt_family(par), info->del_set.dim,
info->del_set.flags, 0, UINT_MAX);
/* Normalize to fit into jiffies */
@@ -423,17 +423,19 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
/* Revision 3 target */
+#define MOPT(opt, member) ((opt).ext.skbinfo.member)
+
static unsigned int
set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_set_info_target_v3 *info = par->targinfo;
int ret;
- ADT_OPT(add_opt, par->family, info->add_set.dim,
+ ADT_OPT(add_opt, xt_family(par), info->add_set.dim,
info->add_set.flags, info->flags, info->timeout);
- ADT_OPT(del_opt, par->family, info->del_set.dim,
+ ADT_OPT(del_opt, xt_family(par), info->del_set.dim,
info->del_set.flags, 0, UINT_MAX);
- ADT_OPT(map_opt, par->family, info->map_set.dim,
+ ADT_OPT(map_opt, xt_family(par), info->map_set.dim,
info->map_set.flags, 0, UINT_MAX);
/* Normalize to fit into jiffies */
@@ -453,14 +455,14 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
if (!ret)
return XT_CONTINUE;
if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBMARK)
- skb->mark = (skb->mark & ~(map_opt.ext.skbmarkmask))
- ^ (map_opt.ext.skbmark);
+ skb->mark = (skb->mark & ~MOPT(map_opt,skbmarkmask))
+ ^ MOPT(map_opt, skbmark);
if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBPRIO)
- skb->priority = map_opt.ext.skbprio;
+ skb->priority = MOPT(map_opt, skbprio);
if ((map_opt.cmdflags & IPSET_FLAG_MAP_SKBQUEUE) &&
skb->dev &&
- skb->dev->real_num_tx_queues > map_opt.ext.skbqueue)
- skb_set_queue_mapping(skb, map_opt.ext.skbqueue);
+ skb->dev->real_num_tx_queues > MOPT(map_opt, skbqueue))
+ skb_set_queue_mapping(skb, MOPT(map_opt, skbqueue));
}
return XT_CONTINUE;
}
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index b10ade272b50..770bbec878f1 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -22,76 +22,14 @@
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
-#define XT_SOCKET_HAVE_IPV6 1
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <net/inet6_hashtables.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#endif
+#include <net/netfilter/nf_socket.h>
#include <linux/netfilter/xt_socket.h>
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-#define XT_SOCKET_HAVE_CONNTRACK 1
-#include <net/netfilter/nf_conntrack.h>
-#endif
-
-static int
-extract_icmp4_fields(const struct sk_buff *skb,
- u8 *protocol,
- __be32 *raddr,
- __be32 *laddr,
- __be16 *rport,
- __be16 *lport)
-{
- unsigned int outside_hdrlen = ip_hdrlen(skb);
- struct iphdr *inside_iph, _inside_iph;
- struct icmphdr *icmph, _icmph;
- __be16 *ports, _ports[2];
-
- icmph = skb_header_pointer(skb, outside_hdrlen,
- sizeof(_icmph), &_icmph);
- if (icmph == NULL)
- return 1;
-
- switch (icmph->type) {
- case ICMP_DEST_UNREACH:
- case ICMP_SOURCE_QUENCH:
- case ICMP_REDIRECT:
- case ICMP_TIME_EXCEEDED:
- case ICMP_PARAMETERPROB:
- break;
- default:
- return 1;
- }
-
- inside_iph = skb_header_pointer(skb, outside_hdrlen +
- sizeof(struct icmphdr),
- sizeof(_inside_iph), &_inside_iph);
- if (inside_iph == NULL)
- return 1;
-
- if (inside_iph->protocol != IPPROTO_TCP &&
- inside_iph->protocol != IPPROTO_UDP)
- return 1;
-
- ports = skb_header_pointer(skb, outside_hdrlen +
- sizeof(struct icmphdr) +
- (inside_iph->ihl << 2),
- sizeof(_ports), &_ports);
- if (ports == NULL)
- return 1;
-
- /* the inside IP packet is the one quoted from our side, thus
- * its saddr is the local address */
- *protocol = inside_iph->protocol;
- *laddr = inside_iph->saddr;
- *lport = ports[0];
- *raddr = inside_iph->daddr;
- *rport = ports[1];
-
- return 0;
-}
-
/* "socket" match based redirection (no specific rule)
* ===================================================
*
@@ -111,104 +49,6 @@ extract_icmp4_fields(const struct sk_buff *skb,
* then local services could intercept traffic going through the
* box.
*/
-static struct sock *
-xt_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
- const u8 protocol,
- const __be32 saddr, const __be32 daddr,
- const __be16 sport, const __be16 dport,
- const struct net_device *in)
-{
- switch (protocol) {
- case IPPROTO_TCP:
- return inet_lookup(net, &tcp_hashinfo, skb, doff,
- saddr, sport, daddr, dport,
- in->ifindex);
- case IPPROTO_UDP:
- return udp4_lib_lookup(net, saddr, sport, daddr, dport,
- in->ifindex);
- }
- return NULL;
-}
-
-static bool xt_socket_sk_is_transparent(struct sock *sk)
-{
- switch (sk->sk_state) {
- case TCP_TIME_WAIT:
- return inet_twsk(sk)->tw_transparent;
-
- case TCP_NEW_SYN_RECV:
- return inet_rsk(inet_reqsk(sk))->no_srccheck;
-
- default:
- return inet_sk(sk)->transparent;
- }
-}
-
-static struct sock *xt_socket_lookup_slow_v4(struct net *net,
- const struct sk_buff *skb,
- const struct net_device *indev)
-{
- const struct iphdr *iph = ip_hdr(skb);
- struct sk_buff *data_skb = NULL;
- int doff = 0;
- __be32 uninitialized_var(daddr), uninitialized_var(saddr);
- __be16 uninitialized_var(dport), uninitialized_var(sport);
- u8 uninitialized_var(protocol);
-#ifdef XT_SOCKET_HAVE_CONNTRACK
- struct nf_conn const *ct;
- enum ip_conntrack_info ctinfo;
-#endif
-
- if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) {
- struct udphdr _hdr, *hp;
-
- hp = skb_header_pointer(skb, ip_hdrlen(skb),
- sizeof(_hdr), &_hdr);
- if (hp == NULL)
- return NULL;
-
- protocol = iph->protocol;
- saddr = iph->saddr;
- sport = hp->source;
- daddr = iph->daddr;
- dport = hp->dest;
- data_skb = (struct sk_buff *)skb;
- doff = iph->protocol == IPPROTO_TCP ?
- ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) :
- ip_hdrlen(skb) + sizeof(*hp);
-
- } else if (iph->protocol == IPPROTO_ICMP) {
- if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
- &sport, &dport))
- return NULL;
- } else {
- return NULL;
- }
-
-#ifdef XT_SOCKET_HAVE_CONNTRACK
- /* Do the lookup with the original socket address in
- * case this is a reply packet of an established
- * SNAT-ted connection.
- */
- ct = nf_ct_get(skb, &ctinfo);
- if (ct && !nf_ct_is_untracked(ct) &&
- ((iph->protocol != IPPROTO_ICMP &&
- ctinfo == IP_CT_ESTABLISHED_REPLY) ||
- (iph->protocol == IPPROTO_ICMP &&
- ctinfo == IP_CT_RELATED_REPLY)) &&
- (ct->status & IPS_SRC_NAT_DONE)) {
-
- daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
- dport = (iph->protocol == IPPROTO_TCP) ?
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port :
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
- }
-#endif
-
- return xt_socket_get_sock_v4(net, data_skb, doff, protocol, saddr,
- daddr, sport, dport, indev);
-}
-
static bool
socket_match(const struct sk_buff *skb, struct xt_action_param *par,
const struct xt_socket_mtinfo1 *info)
@@ -217,7 +57,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
struct sock *sk = skb->sk;
if (!sk)
- sk = xt_socket_lookup_slow_v4(par->net, skb, par->in);
+ sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par));
if (sk) {
bool wildcard;
bool transparent = true;
@@ -233,7 +73,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
* if XT_SOCKET_TRANSPARENT is used
*/
if (info->flags & XT_SOCKET_TRANSPARENT)
- transparent = xt_socket_sk_is_transparent(sk);
+ transparent = nf_sk_is_transparent(sk);
if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
transparent)
@@ -265,132 +105,7 @@ socket_mt4_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
return socket_match(skb, par, par->matchinfo);
}
-#ifdef XT_SOCKET_HAVE_IPV6
-
-static int
-extract_icmp6_fields(const struct sk_buff *skb,
- unsigned int outside_hdrlen,
- int *protocol,
- const struct in6_addr **raddr,
- const struct in6_addr **laddr,
- __be16 *rport,
- __be16 *lport,
- struct ipv6hdr *ipv6_var)
-{
- const struct ipv6hdr *inside_iph;
- struct icmp6hdr *icmph, _icmph;
- __be16 *ports, _ports[2];
- u8 inside_nexthdr;
- __be16 inside_fragoff;
- int inside_hdrlen;
-
- icmph = skb_header_pointer(skb, outside_hdrlen,
- sizeof(_icmph), &_icmph);
- if (icmph == NULL)
- return 1;
-
- if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK)
- return 1;
-
- inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph),
- sizeof(*ipv6_var), ipv6_var);
- if (inside_iph == NULL)
- return 1;
- inside_nexthdr = inside_iph->nexthdr;
-
- inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) +
- sizeof(*ipv6_var),
- &inside_nexthdr, &inside_fragoff);
- if (inside_hdrlen < 0)
- return 1; /* hjm: Packet has no/incomplete transport layer headers. */
-
- if (inside_nexthdr != IPPROTO_TCP &&
- inside_nexthdr != IPPROTO_UDP)
- return 1;
-
- ports = skb_header_pointer(skb, inside_hdrlen,
- sizeof(_ports), &_ports);
- if (ports == NULL)
- return 1;
-
- /* the inside IP packet is the one quoted from our side, thus
- * its saddr is the local address */
- *protocol = inside_nexthdr;
- *laddr = &inside_iph->saddr;
- *lport = ports[0];
- *raddr = &inside_iph->daddr;
- *rport = ports[1];
-
- return 0;
-}
-
-static struct sock *
-xt_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
- const u8 protocol,
- const struct in6_addr *saddr, const struct in6_addr *daddr,
- const __be16 sport, const __be16 dport,
- const struct net_device *in)
-{
- switch (protocol) {
- case IPPROTO_TCP:
- return inet6_lookup(net, &tcp_hashinfo, skb, doff,
- saddr, sport, daddr, dport,
- in->ifindex);
- case IPPROTO_UDP:
- return udp6_lib_lookup(net, saddr, sport, daddr, dport,
- in->ifindex);
- }
-
- return NULL;
-}
-
-static struct sock *xt_socket_lookup_slow_v6(struct net *net,
- const struct sk_buff *skb,
- const struct net_device *indev)
-{
- __be16 uninitialized_var(dport), uninitialized_var(sport);
- const struct in6_addr *daddr = NULL, *saddr = NULL;
- struct ipv6hdr *iph = ipv6_hdr(skb);
- struct sk_buff *data_skb = NULL;
- int doff = 0;
- int thoff = 0, tproto;
-
- tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
- if (tproto < 0) {
- pr_debug("unable to find transport header in IPv6 packet, dropping\n");
- return NULL;
- }
-
- if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) {
- struct udphdr _hdr, *hp;
-
- hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
- if (hp == NULL)
- return NULL;
-
- saddr = &iph->saddr;
- sport = hp->source;
- daddr = &iph->daddr;
- dport = hp->dest;
- data_skb = (struct sk_buff *)skb;
- doff = tproto == IPPROTO_TCP ?
- thoff + __tcp_hdrlen((struct tcphdr *)hp) :
- thoff + sizeof(*hp);
-
- } else if (tproto == IPPROTO_ICMPV6) {
- struct ipv6hdr ipv6_var;
-
- if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
- &sport, &dport, &ipv6_var))
- return NULL;
- } else {
- return NULL;
- }
-
- return xt_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr,
- sport, dport, indev);
-}
-
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static bool
socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
{
@@ -399,7 +114,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
struct sock *sk = skb->sk;
if (!sk)
- sk = xt_socket_lookup_slow_v6(par->net, skb, par->in);
+ sk = nf_sk_lookup_slow_v6(xt_net(par), skb, xt_in(par));
if (sk) {
bool wildcard;
bool transparent = true;
@@ -415,7 +130,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
* if XT_SOCKET_TRANSPARENT is used
*/
if (info->flags & XT_SOCKET_TRANSPARENT)
- transparent = xt_socket_sk_is_transparent(sk);
+ transparent = nf_sk_is_transparent(sk);
if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
transparent)
@@ -432,9 +147,28 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
}
#endif
+static int socket_mt_enable_defrag(struct net *net, int family)
+{
+ switch (family) {
+ case NFPROTO_IPV4:
+ return nf_defrag_ipv4_enable(net);
+#ifdef XT_SOCKET_HAVE_IPV6
+ case NFPROTO_IPV6:
+ return nf_defrag_ipv6_enable(net);
+#endif
+ }
+ WARN_ONCE(1, "Unknown family %d\n", family);
+ return 0;
+}
+
static int socket_mt_v1_check(const struct xt_mtchk_param *par)
{
const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
+ int err;
+
+ err = socket_mt_enable_defrag(par->net, par->family);
+ if (err)
+ return err;
if (info->flags & ~XT_SOCKET_FLAGS_V1) {
pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1);
@@ -446,6 +180,11 @@ static int socket_mt_v1_check(const struct xt_mtchk_param *par)
static int socket_mt_v2_check(const struct xt_mtchk_param *par)
{
const struct xt_socket_mtinfo2 *info = (struct xt_socket_mtinfo2 *) par->matchinfo;
+ int err;
+
+ err = socket_mt_enable_defrag(par->net, par->family);
+ if (err)
+ return err;
if (info->flags & ~XT_SOCKET_FLAGS_V2) {
pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2);
@@ -458,7 +197,11 @@ static int socket_mt_v3_check(const struct xt_mtchk_param *par)
{
const struct xt_socket_mtinfo3 *info =
(struct xt_socket_mtinfo3 *)par->matchinfo;
+ int err;
+ err = socket_mt_enable_defrag(par->net, par->family);
+ if (err)
+ return err;
if (info->flags & ~XT_SOCKET_FLAGS_V3) {
pr_info("unknown flags 0x%x\n",
info->flags & ~XT_SOCKET_FLAGS_V3);
@@ -488,7 +231,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
},
-#ifdef XT_SOCKET_HAVE_IPV6
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
{
.name = "socket",
.revision = 1,
@@ -512,7 +255,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
},
-#ifdef XT_SOCKET_HAVE_IPV6
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
{
.name = "socket",
.revision = 2,
@@ -536,7 +279,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
},
-#ifdef XT_SOCKET_HAVE_IPV6
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
{
.name = "socket",
.revision = 3,
@@ -553,11 +296,6 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
static int __init socket_mt_init(void)
{
- nf_defrag_ipv4_enable();
-#ifdef XT_SOCKET_HAVE_IPV6
- nf_defrag_ipv6_enable();
-#endif
-
return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg));
}
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index a507922d80cd..5746a33789a5 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -43,7 +43,7 @@ static int state_mt_check(const struct xt_mtchk_param *par)
{
int ret;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -52,7 +52,7 @@ static int state_mt_check(const struct xt_mtchk_param *par)
static void state_mt_destroy(const struct xt_mtdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_match state_mt_reg __read_mostly = {
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index 0bc3460319c8..423293ee57c2 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -77,6 +77,7 @@ static struct xt_match xt_string_mt_reg __read_mostly = {
.match = string_mt,
.destroy = string_mt_destroy,
.matchsize = sizeof(struct xt_string_info),
+ .usersize = offsetof(struct xt_string_info, config),
.me = THIS_MODULE,
};
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
index 0ae55a36f492..1b01eec1fbda 100644
--- a/net/netfilter/xt_time.c
+++ b/net/netfilter/xt_time.c
@@ -168,7 +168,7 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par)
* may happen that the same packet matches both rules if
* it arrived at the right moment before 13:00.
*/
- if (skb->tstamp.tv64 == 0)
+ if (skb->tstamp == 0)
__net_timestamp((struct sk_buff *)skb);
stamp = ktime_to_ns(skb->tstamp);
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
index 2ec93c5e77bb..d177dd066504 100644
--- a/net/netlabel/netlabel_calipso.c
+++ b/net/netlabel/netlabel_calipso.c
@@ -60,13 +60,7 @@ struct netlbl_domhsh_walk_arg {
};
/* NetLabel Generic NETLINK CALIPSO family */
-static struct genl_family netlbl_calipso_gnl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = NETLBL_NLTYPE_CALIPSO_NAME,
- .version = NETLBL_PROTO_VERSION,
- .maxattr = NLBL_CALIPSO_A_MAX,
-};
+static struct genl_family netlbl_calipso_gnl_family;
/* NetLabel Netlink attribute policy */
static const struct nla_policy calipso_genl_policy[NLBL_CALIPSO_A_MAX + 1] = {
@@ -355,6 +349,16 @@ static const struct genl_ops netlbl_calipso_ops[] = {
},
};
+static struct genl_family netlbl_calipso_gnl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = NETLBL_NLTYPE_CALIPSO_NAME,
+ .version = NETLBL_PROTO_VERSION,
+ .maxattr = NLBL_CALIPSO_A_MAX,
+ .module = THIS_MODULE,
+ .ops = netlbl_calipso_ops,
+ .n_ops = ARRAY_SIZE(netlbl_calipso_ops),
+};
+
/* NetLabel Generic NETLINK Protocol Functions
*/
@@ -368,8 +372,7 @@ static const struct genl_ops netlbl_calipso_ops[] = {
*/
int __init netlbl_calipso_genl_init(void)
{
- return genl_register_family_with_ops(&netlbl_calipso_gnl_family,
- netlbl_calipso_ops);
+ return genl_register_family(&netlbl_calipso_gnl_family);
}
static const struct netlbl_calipso_ops *calipso_ops;
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index 7fd1104ba900..4149d3e63589 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -59,14 +59,7 @@ struct netlbl_domhsh_walk_arg {
};
/* NetLabel Generic NETLINK CIPSOv4 family */
-static struct genl_family netlbl_cipsov4_gnl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = NETLBL_NLTYPE_CIPSOV4_NAME,
- .version = NETLBL_PROTO_VERSION,
- .maxattr = NLBL_CIPSOV4_A_MAX,
-};
-
+static struct genl_family netlbl_cipsov4_gnl_family;
/* NetLabel Netlink attribute policy */
static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1] = {
[NLBL_CIPSOV4_A_DOI] = { .type = NLA_U32 },
@@ -767,6 +760,16 @@ static const struct genl_ops netlbl_cipsov4_ops[] = {
},
};
+static struct genl_family netlbl_cipsov4_gnl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = NETLBL_NLTYPE_CIPSOV4_NAME,
+ .version = NETLBL_PROTO_VERSION,
+ .maxattr = NLBL_CIPSOV4_A_MAX,
+ .module = THIS_MODULE,
+ .ops = netlbl_cipsov4_ops,
+ .n_ops = ARRAY_SIZE(netlbl_cipsov4_ops),
+};
+
/*
* NetLabel Generic NETLINK Protocol Functions
*/
@@ -781,6 +784,5 @@ static const struct genl_ops netlbl_cipsov4_ops[] = {
*/
int __init netlbl_cipsov4_genl_init(void)
{
- return genl_register_family_with_ops(&netlbl_cipsov4_gnl_family,
- netlbl_cipsov4_ops);
+ return genl_register_family(&netlbl_cipsov4_gnl_family);
}
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 28c56b95fb7f..ea7c67050792 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -1502,10 +1502,7 @@ static int __init netlbl_init(void)
printk(KERN_INFO "NetLabel: Initializing\n");
printk(KERN_INFO "NetLabel: domain hash size = %u\n",
(1 << NETLBL_DOMHSH_BITSIZE));
- printk(KERN_INFO "NetLabel: protocols ="
- " UNLABELED"
- " CIPSOv4"
- "\n");
+ printk(KERN_INFO "NetLabel: protocols = UNLABELED CIPSOv4 CALIPSO\n");
ret_val = netlbl_domhsh_init(NETLBL_DOMHSH_BITSIZE);
if (ret_val != 0)
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index f85d0e07af2d..21e0095b1d14 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -60,13 +60,7 @@ struct netlbl_domhsh_walk_arg {
};
/* NetLabel Generic NETLINK CIPSOv4 family */
-static struct genl_family netlbl_mgmt_gnl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = NETLBL_NLTYPE_MGMT_NAME,
- .version = NETLBL_PROTO_VERSION,
- .maxattr = NLBL_MGMT_A_MAX,
-};
+static struct genl_family netlbl_mgmt_gnl_family;
/* NetLabel Netlink attribute policy */
static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = {
@@ -834,6 +828,16 @@ static const struct genl_ops netlbl_mgmt_genl_ops[] = {
},
};
+static struct genl_family netlbl_mgmt_gnl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = NETLBL_NLTYPE_MGMT_NAME,
+ .version = NETLBL_PROTO_VERSION,
+ .maxattr = NLBL_MGMT_A_MAX,
+ .module = THIS_MODULE,
+ .ops = netlbl_mgmt_genl_ops,
+ .n_ops = ARRAY_SIZE(netlbl_mgmt_genl_ops),
+};
+
/*
* NetLabel Generic NETLINK Protocol Functions
*/
@@ -848,6 +852,5 @@ static const struct genl_ops netlbl_mgmt_genl_ops[] = {
*/
int __init netlbl_mgmt_genl_init(void)
{
- return genl_register_family_with_ops(&netlbl_mgmt_gnl_family,
- netlbl_mgmt_genl_ops);
+ return genl_register_family(&netlbl_mgmt_gnl_family);
}
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 4528cff9138b..22dc1b9d6362 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -123,13 +123,7 @@ static struct netlbl_unlhsh_iface __rcu *netlbl_unlhsh_def;
static u8 netlabel_unlabel_acceptflg;
/* NetLabel Generic NETLINK unlabeled family */
-static struct genl_family netlbl_unlabel_gnl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = NETLBL_NLTYPE_UNLABELED_NAME,
- .version = NETLBL_PROTO_VERSION,
- .maxattr = NLBL_UNLABEL_A_MAX,
-};
+static struct genl_family netlbl_unlabel_gnl_family;
/* NetLabel Netlink attribute policy */
static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = {
@@ -1378,6 +1372,16 @@ static const struct genl_ops netlbl_unlabel_genl_ops[] = {
},
};
+static struct genl_family netlbl_unlabel_gnl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = NETLBL_NLTYPE_UNLABELED_NAME,
+ .version = NETLBL_PROTO_VERSION,
+ .maxattr = NLBL_UNLABEL_A_MAX,
+ .module = THIS_MODULE,
+ .ops = netlbl_unlabel_genl_ops,
+ .n_ops = ARRAY_SIZE(netlbl_unlabel_genl_ops),
+};
+
/*
* NetLabel Generic NETLINK Protocol Functions
*/
@@ -1392,8 +1396,7 @@ static const struct genl_ops netlbl_unlabel_genl_ops[] = {
*/
int __init netlbl_unlabel_genl_init(void)
{
- return genl_register_family_with_ops(&netlbl_unlabel_gnl_family,
- netlbl_unlabel_genl_ops);
+ return genl_register_family(&netlbl_unlabel_gnl_family);
}
/*
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 246f29d365c0..596eaff66649 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -40,7 +40,7 @@
#include <linux/net.h>
#include <linux/fs.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
@@ -96,6 +96,44 @@ EXPORT_SYMBOL_GPL(nl_table);
static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
+static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS];
+
+static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = {
+ "nlk_cb_mutex-ROUTE",
+ "nlk_cb_mutex-1",
+ "nlk_cb_mutex-USERSOCK",
+ "nlk_cb_mutex-FIREWALL",
+ "nlk_cb_mutex-SOCK_DIAG",
+ "nlk_cb_mutex-NFLOG",
+ "nlk_cb_mutex-XFRM",
+ "nlk_cb_mutex-SELINUX",
+ "nlk_cb_mutex-ISCSI",
+ "nlk_cb_mutex-AUDIT",
+ "nlk_cb_mutex-FIB_LOOKUP",
+ "nlk_cb_mutex-CONNECTOR",
+ "nlk_cb_mutex-NETFILTER",
+ "nlk_cb_mutex-IP6_FW",
+ "nlk_cb_mutex-DNRTMSG",
+ "nlk_cb_mutex-KOBJECT_UEVENT",
+ "nlk_cb_mutex-GENERIC",
+ "nlk_cb_mutex-17",
+ "nlk_cb_mutex-SCSITRANSPORT",
+ "nlk_cb_mutex-ECRYPTFS",
+ "nlk_cb_mutex-RDMA",
+ "nlk_cb_mutex-CRYPTO",
+ "nlk_cb_mutex-SMC",
+ "nlk_cb_mutex-23",
+ "nlk_cb_mutex-24",
+ "nlk_cb_mutex-25",
+ "nlk_cb_mutex-26",
+ "nlk_cb_mutex-27",
+ "nlk_cb_mutex-28",
+ "nlk_cb_mutex-29",
+ "nlk_cb_mutex-30",
+ "nlk_cb_mutex-31",
+ "nlk_cb_mutex-MAX_LINKS"
+};
+
static int netlink_dump(struct sock *sk);
static void netlink_skb_destructor(struct sk_buff *skb);
@@ -113,7 +151,7 @@ static atomic_t nl_table_users = ATOMIC_INIT(0);
#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock));
-static ATOMIC_NOTIFIER_HEAD(netlink_chain);
+static BLOCKING_NOTIFIER_HEAD(netlink_chain);
static DEFINE_SPINLOCK(netlink_tap_lock);
static struct list_head netlink_tap_all __read_mostly;
@@ -585,6 +623,9 @@ static int __netlink_create(struct net *net, struct socket *sock,
} else {
nlk->cb_mutex = &nlk->cb_def_mutex;
mutex_init(nlk->cb_mutex);
+ lockdep_set_class_and_name(nlk->cb_mutex,
+ nlk_cb_mutex_keys + protocol,
+ nlk_cb_mutex_key_strings[protocol]);
}
init_waitqueue_head(&nlk->wait);
@@ -711,7 +752,7 @@ static int netlink_release(struct socket *sock)
.protocol = sk->sk_protocol,
.portid = nlk->portid,
};
- atomic_notifier_call_chain(&netlink_chain,
+ blocking_notifier_call_chain(&netlink_chain,
NETLINK_URELEASE, &n);
}
@@ -1210,9 +1251,9 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
skb = nskb;
}
- if (!pskb_expand_head(skb, 0, -delta, allocation))
- skb->truesize -= delta;
-
+ pskb_expand_head(skb, 0, -delta,
+ (allocation & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_NOWARN | __GFP_NORETRY);
return skb;
}
@@ -2504,13 +2545,13 @@ static const struct file_operations netlink_seq_fops = {
int netlink_register_notifier(struct notifier_block *nb)
{
- return atomic_notifier_chain_register(&netlink_chain, nb);
+ return blocking_notifier_chain_register(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_register_notifier);
int netlink_unregister_notifier(struct notifier_block *nb)
{
- return atomic_notifier_chain_unregister(&netlink_chain, nb);
+ return blocking_notifier_chain_unregister(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_unregister_notifier);
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 49c28e8ef01b..92e0981f7404 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -17,6 +17,7 @@
#include <linux/mutex.h>
#include <linux/bitmap.h>
#include <linux/rwsem.h>
+#include <linux/idr.h>
#include <net/sock.h>
#include <net/genetlink.h>
@@ -58,10 +59,8 @@ static void genl_unlock_all(void)
up_write(&cb_lock);
}
-#define GENL_FAM_TAB_SIZE 16
-#define GENL_FAM_TAB_MASK (GENL_FAM_TAB_SIZE - 1)
+static DEFINE_IDR(genl_fam_idr);
-static struct list_head family_ht[GENL_FAM_TAB_SIZE];
/*
* Bitmap of multicast groups that are currently in use.
*
@@ -86,45 +85,29 @@ static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) |
static unsigned long *mc_groups = &mc_group_start;
static unsigned long mc_groups_longs = 1;
-static int genl_ctrl_event(int event, struct genl_family *family,
+static int genl_ctrl_event(int event, const struct genl_family *family,
const struct genl_multicast_group *grp,
int grp_id);
-static inline unsigned int genl_family_hash(unsigned int id)
+static const struct genl_family *genl_family_find_byid(unsigned int id)
{
- return id & GENL_FAM_TAB_MASK;
+ return idr_find(&genl_fam_idr, id);
}
-static inline struct list_head *genl_family_chain(unsigned int id)
+static const struct genl_family *genl_family_find_byname(char *name)
{
- return &family_ht[genl_family_hash(id)];
-}
-
-static struct genl_family *genl_family_find_byid(unsigned int id)
-{
- struct genl_family *f;
-
- list_for_each_entry(f, genl_family_chain(id), family_list)
- if (f->id == id)
- return f;
-
- return NULL;
-}
-
-static struct genl_family *genl_family_find_byname(char *name)
-{
- struct genl_family *f;
- int i;
+ const struct genl_family *family;
+ unsigned int id;
- for (i = 0; i < GENL_FAM_TAB_SIZE; i++)
- list_for_each_entry(f, genl_family_chain(i), family_list)
- if (strcmp(f->name, name) == 0)
- return f;
+ idr_for_each_entry(&genl_fam_idr, family, id)
+ if (strcmp(family->name, name) == 0)
+ return family;
return NULL;
}
-static const struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family)
+static const struct genl_ops *genl_get_cmd(u8 cmd,
+ const struct genl_family *family)
{
int i;
@@ -135,26 +118,6 @@ static const struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family)
return NULL;
}
-/* Of course we are going to have problems once we hit
- * 2^16 alive types, but that can only happen by year 2K
-*/
-static u16 genl_generate_id(void)
-{
- static u16 id_gen_idx = GENL_MIN_ID;
- int i;
-
- for (i = 0; i <= GENL_MAX_ID - GENL_MIN_ID; i++) {
- if (id_gen_idx != GENL_ID_VFS_DQUOT &&
- id_gen_idx != GENL_ID_PMCRAID &&
- !genl_family_find_byid(id_gen_idx))
- return id_gen_idx;
- if (++id_gen_idx > GENL_MAX_ID)
- id_gen_idx = GENL_MIN_ID;
- }
-
- return 0;
-}
-
static int genl_allocate_reserve_groups(int n_groups, int *first_id)
{
unsigned long *new_groups;
@@ -295,7 +258,7 @@ static int genl_validate_assign_mc_groups(struct genl_family *family)
return err;
}
-static void genl_unregister_mc_groups(struct genl_family *family)
+static void genl_unregister_mc_groups(const struct genl_family *family)
{
struct net *net;
int i;
@@ -344,28 +307,21 @@ static int genl_validate_ops(const struct genl_family *family)
}
/**
- * __genl_register_family - register a generic netlink family
+ * genl_register_family - register a generic netlink family
* @family: generic netlink family
*
* Registers the specified family after validating it first. Only one
* family may be registered with the same family name or identifier.
- * The family id may equal GENL_ID_GENERATE causing an unique id to
- * be automatically generated and assigned.
*
- * The family's ops array must already be assigned, you can use the
- * genl_register_family_with_ops() helper function.
+ * The family's ops, multicast groups and module pointer must already
+ * be assigned.
*
* Return 0 on success or a negative error code.
*/
-int __genl_register_family(struct genl_family *family)
+int genl_register_family(struct genl_family *family)
{
- int err = -EINVAL, i;
-
- if (family->id && family->id < GENL_MIN_ID)
- goto errout;
-
- if (family->id > GENL_MAX_ID)
- goto errout;
+ int err, i;
+ int start = GENL_START_ALLOC, end = GENL_MAX_ID;
err = genl_validate_ops(family);
if (err)
@@ -378,18 +334,20 @@ int __genl_register_family(struct genl_family *family)
goto errout_locked;
}
- if (family->id == GENL_ID_GENERATE) {
- u16 newid = genl_generate_id();
-
- if (!newid) {
- err = -ENOMEM;
- goto errout_locked;
- }
-
- family->id = newid;
- } else if (genl_family_find_byid(family->id)) {
- err = -EEXIST;
- goto errout_locked;
+ /*
+ * Sadly, a few cases need to be special-cased
+ * due to them having previously abused the API
+ * and having used their family ID also as their
+ * multicast group ID, so we use reserved IDs
+ * for both to be sure we can do that mapping.
+ */
+ if (family == &genl_ctrl) {
+ /* and this needs to be special for initial family lookups */
+ start = end = GENL_ID_CTRL;
+ } else if (strcmp(family->name, "pmcraid") == 0) {
+ start = end = GENL_ID_PMCRAID;
+ } else if (strcmp(family->name, "VFS_DQUOT") == 0) {
+ start = end = GENL_ID_VFS_DQUOT;
}
if (family->maxattr && !family->parallel_ops) {
@@ -402,11 +360,17 @@ int __genl_register_family(struct genl_family *family)
} else
family->attrbuf = NULL;
+ family->id = idr_alloc(&genl_fam_idr, family,
+ start, end + 1, GFP_KERNEL);
+ if (family->id < 0) {
+ err = family->id;
+ goto errout_locked;
+ }
+
err = genl_validate_assign_mc_groups(family);
if (err)
- goto errout_free;
+ goto errout_remove;
- list_add_tail(&family->family_list, genl_family_chain(family->id));
genl_unlock_all();
/* send all events */
@@ -417,14 +381,14 @@ int __genl_register_family(struct genl_family *family)
return 0;
-errout_free:
+errout_remove:
+ idr_remove(&genl_fam_idr, family->id);
kfree(family->attrbuf);
errout_locked:
genl_unlock_all();
-errout:
return err;
}
-EXPORT_SYMBOL(__genl_register_family);
+EXPORT_SYMBOL(genl_register_family);
/**
* genl_unregister_family - unregister generic netlink family
@@ -434,33 +398,29 @@ EXPORT_SYMBOL(__genl_register_family);
*
* Returns 0 on success or a negative error code.
*/
-int genl_unregister_family(struct genl_family *family)
+int genl_unregister_family(const struct genl_family *family)
{
- struct genl_family *rc;
-
genl_lock_all();
- list_for_each_entry(rc, genl_family_chain(family->id), family_list) {
- if (family->id != rc->id || strcmp(rc->name, family->name))
- continue;
+ if (!genl_family_find_byid(family->id)) {
+ genl_unlock_all();
+ return -ENOENT;
+ }
- genl_unregister_mc_groups(family);
+ genl_unregister_mc_groups(family);
- list_del(&rc->family_list);
- family->n_ops = 0;
- up_write(&cb_lock);
- wait_event(genl_sk_destructing_waitq,
- atomic_read(&genl_sk_destructing_cnt) == 0);
- genl_unlock();
+ idr_remove(&genl_fam_idr, family->id);
- kfree(family->attrbuf);
- genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0);
- return 0;
- }
+ up_write(&cb_lock);
+ wait_event(genl_sk_destructing_waitq,
+ atomic_read(&genl_sk_destructing_cnt) == 0);
+ genl_unlock();
- genl_unlock_all();
+ kfree(family->attrbuf);
+
+ genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0);
- return -ENOENT;
+ return 0;
}
EXPORT_SYMBOL(genl_unregister_family);
@@ -476,7 +436,7 @@ EXPORT_SYMBOL(genl_unregister_family);
* Returns pointer to user specific header
*/
void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
- struct genl_family *family, int flags, u8 cmd)
+ const struct genl_family *family, int flags, u8 cmd)
{
struct nlmsghdr *nlh;
struct genlmsghdr *hdr;
@@ -535,7 +495,7 @@ static int genl_lock_done(struct netlink_callback *cb)
return rc;
}
-static int genl_family_rcv_msg(struct genl_family *family,
+static int genl_family_rcv_msg(const struct genl_family *family,
struct sk_buff *skb,
struct nlmsghdr *nlh)
{
@@ -647,7 +607,7 @@ out:
static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
- struct genl_family *family;
+ const struct genl_family *family;
int err;
family = genl_family_find_byid(nlh->nlmsg_type);
@@ -676,15 +636,9 @@ static void genl_rcv(struct sk_buff *skb)
* Controller
**************************************************************************/
-static struct genl_family genl_ctrl = {
- .id = GENL_ID_CTRL,
- .name = "nlctrl",
- .version = 0x2,
- .maxattr = CTRL_ATTR_MAX,
- .netnsok = true,
-};
+static struct genl_family genl_ctrl;
-static int ctrl_fill_info(struct genl_family *family, u32 portid, u32 seq,
+static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq,
u32 flags, struct sk_buff *skb, u8 cmd)
{
void *hdr;
@@ -771,7 +725,7 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int ctrl_fill_mcgrp_info(struct genl_family *family,
+static int ctrl_fill_mcgrp_info(const struct genl_family *family,
const struct genl_multicast_group *grp,
int grp_id, u32 portid, u32 seq, u32 flags,
struct sk_buff *skb, u8 cmd)
@@ -814,37 +768,32 @@ nla_put_failure:
static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
{
-
- int i, n = 0;
+ int n = 0;
struct genl_family *rt;
struct net *net = sock_net(skb->sk);
- int chains_to_skip = cb->args[0];
- int fams_to_skip = cb->args[1];
-
- for (i = chains_to_skip; i < GENL_FAM_TAB_SIZE; i++) {
- n = 0;
- list_for_each_entry(rt, genl_family_chain(i), family_list) {
- if (!rt->netnsok && !net_eq(net, &init_net))
- continue;
- if (++n < fams_to_skip)
- continue;
- if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI,
- skb, CTRL_CMD_NEWFAMILY) < 0)
- goto errout;
- }
+ int fams_to_skip = cb->args[0];
+ unsigned int id;
- fams_to_skip = 0;
- }
+ idr_for_each_entry(&genl_fam_idr, rt, id) {
+ if (!rt->netnsok && !net_eq(net, &init_net))
+ continue;
-errout:
- cb->args[0] = i;
- cb->args[1] = n;
+ if (n++ < fams_to_skip)
+ continue;
+ if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ skb, CTRL_CMD_NEWFAMILY) < 0) {
+ n--;
+ break;
+ }
+ }
+
+ cb->args[0] = n;
return skb->len;
}
-static struct sk_buff *ctrl_build_family_msg(struct genl_family *family,
+static struct sk_buff *ctrl_build_family_msg(const struct genl_family *family,
u32 portid, int seq, u8 cmd)
{
struct sk_buff *skb;
@@ -864,7 +813,7 @@ static struct sk_buff *ctrl_build_family_msg(struct genl_family *family,
}
static struct sk_buff *
-ctrl_build_mcgrp_msg(struct genl_family *family,
+ctrl_build_mcgrp_msg(const struct genl_family *family,
const struct genl_multicast_group *grp,
int grp_id, u32 portid, int seq, u8 cmd)
{
@@ -894,7 +843,7 @@ static const struct nla_policy ctrl_policy[CTRL_ATTR_MAX+1] = {
static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info)
{
struct sk_buff *msg;
- struct genl_family *res = NULL;
+ const struct genl_family *res = NULL;
int err = -EINVAL;
if (info->attrs[CTRL_ATTR_FAMILY_ID]) {
@@ -938,7 +887,7 @@ static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info)
return genlmsg_reply(msg, info);
}
-static int genl_ctrl_event(int event, struct genl_family *family,
+static int genl_ctrl_event(int event, const struct genl_family *family,
const struct genl_multicast_group *grp,
int grp_id)
{
@@ -992,27 +941,39 @@ static const struct genl_multicast_group genl_ctrl_groups[] = {
{ .name = "notify", },
};
+static struct genl_family genl_ctrl __ro_after_init = {
+ .module = THIS_MODULE,
+ .ops = genl_ctrl_ops,
+ .n_ops = ARRAY_SIZE(genl_ctrl_ops),
+ .mcgrps = genl_ctrl_groups,
+ .n_mcgrps = ARRAY_SIZE(genl_ctrl_groups),
+ .id = GENL_ID_CTRL,
+ .name = "nlctrl",
+ .version = 0x2,
+ .maxattr = CTRL_ATTR_MAX,
+ .netnsok = true,
+};
+
static int genl_bind(struct net *net, int group)
{
- int i, err = -ENOENT;
+ struct genl_family *f;
+ int err = -ENOENT;
+ unsigned int id;
down_read(&cb_lock);
- for (i = 0; i < GENL_FAM_TAB_SIZE; i++) {
- struct genl_family *f;
-
- list_for_each_entry(f, genl_family_chain(i), family_list) {
- if (group >= f->mcgrp_offset &&
- group < f->mcgrp_offset + f->n_mcgrps) {
- int fam_grp = group - f->mcgrp_offset;
-
- if (!f->netnsok && net != &init_net)
- err = -ENOENT;
- else if (f->mcast_bind)
- err = f->mcast_bind(net, fam_grp);
- else
- err = 0;
- break;
- }
+
+ idr_for_each_entry(&genl_fam_idr, f, id) {
+ if (group >= f->mcgrp_offset &&
+ group < f->mcgrp_offset + f->n_mcgrps) {
+ int fam_grp = group - f->mcgrp_offset;
+
+ if (!f->netnsok && net != &init_net)
+ err = -ENOENT;
+ else if (f->mcast_bind)
+ err = f->mcast_bind(net, fam_grp);
+ else
+ err = 0;
+ break;
}
}
up_read(&cb_lock);
@@ -1022,21 +983,19 @@ static int genl_bind(struct net *net, int group)
static void genl_unbind(struct net *net, int group)
{
- int i;
+ struct genl_family *f;
+ unsigned int id;
down_read(&cb_lock);
- for (i = 0; i < GENL_FAM_TAB_SIZE; i++) {
- struct genl_family *f;
- list_for_each_entry(f, genl_family_chain(i), family_list) {
- if (group >= f->mcgrp_offset &&
- group < f->mcgrp_offset + f->n_mcgrps) {
- int fam_grp = group - f->mcgrp_offset;
+ idr_for_each_entry(&genl_fam_idr, f, id) {
+ if (group >= f->mcgrp_offset &&
+ group < f->mcgrp_offset + f->n_mcgrps) {
+ int fam_grp = group - f->mcgrp_offset;
- if (f->mcast_unbind)
- f->mcast_unbind(net, fam_grp);
- break;
- }
+ if (f->mcast_unbind)
+ f->mcast_unbind(net, fam_grp);
+ break;
}
}
up_read(&cb_lock);
@@ -1076,13 +1035,9 @@ static struct pernet_operations genl_pernet_ops = {
static int __init genl_init(void)
{
- int i, err;
-
- for (i = 0; i < GENL_FAM_TAB_SIZE; i++)
- INIT_LIST_HEAD(&family_ht[i]);
+ int err;
- err = genl_register_family_with_ops_groups(&genl_ctrl, genl_ctrl_ops,
- genl_ctrl_groups);
+ err = genl_register_family(&genl_ctrl);
if (err < 0)
goto problem;
@@ -1098,6 +1053,25 @@ problem:
subsys_initcall(genl_init);
+/**
+ * genl_family_attrbuf - return family's attrbuf
+ * @family: the family
+ *
+ * Return the family's attrbuf, while validating that it's
+ * actually valid to access it.
+ *
+ * You cannot use this function with a family that has parallel_ops
+ * and you can only use it within (pre/post) doit/dumpit callbacks.
+ */
+struct nlattr **genl_family_attrbuf(const struct genl_family *family)
+{
+ if (!WARN_ON(family->parallel_ops))
+ lockdep_assert_held(&genl_mutex);
+
+ return family->attrbuf;
+}
+EXPORT_SYMBOL(genl_family_attrbuf);
+
static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group,
gfp_t flags)
{
@@ -1127,8 +1101,9 @@ static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group,
return err;
}
-int genlmsg_multicast_allns(struct genl_family *family, struct sk_buff *skb,
- u32 portid, unsigned int group, gfp_t flags)
+int genlmsg_multicast_allns(const struct genl_family *family,
+ struct sk_buff *skb, u32 portid,
+ unsigned int group, gfp_t flags)
{
if (WARN_ON_ONCE(group >= family->n_mcgrps))
return -EINVAL;
@@ -1137,7 +1112,7 @@ int genlmsg_multicast_allns(struct genl_family *family, struct sk_buff *skb,
}
EXPORT_SYMBOL(genlmsg_multicast_allns);
-void genl_notify(struct genl_family *family, struct sk_buff *skb,
+void genl_notify(const struct genl_family *family, struct sk_buff *skb,
struct genl_info *info, u32 group, gfp_t flags)
{
struct net *net = genl_info_net(info);
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index ed212ffc1d9d..ebf16f7f9089 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -17,7 +17,7 @@
#include <linux/in.h>
#include <linux/slab.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
@@ -765,7 +765,8 @@ out_release:
return err;
}
-static int nr_accept(struct socket *sock, struct socket *newsock, int flags)
+static int nr_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern)
{
struct sk_buff *skb;
struct sock *newsk;
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index b9edf5fae6ae..2ffb18e73df6 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -21,6 +21,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/nfc.h>
+#include <linux/sched/signal.h>
#include "nfc.h"
#include "llcp.h"
@@ -440,7 +441,7 @@ struct sock *nfc_llcp_accept_dequeue(struct sock *parent,
}
static int llcp_sock_accept(struct socket *sock, struct socket *newsock,
- int flags)
+ int flags, bool kern)
{
DECLARE_WAITQUEUE(wait, current);
struct sock *sk = sock->sk, *new_sk;
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index ea023b35f1c2..03f3d5c7beb8 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -38,14 +38,7 @@ static const struct genl_multicast_group nfc_genl_mcgrps[] = {
{ .name = NFC_GENL_MCAST_EVENT_NAME, },
};
-static struct genl_family nfc_genl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = NFC_GENL_NAME,
- .version = NFC_GENL_VERSION,
- .maxattr = NFC_ATTR_MAX,
-};
-
+static struct genl_family nfc_genl_family;
static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = {
[NFC_ATTR_DEVICE_INDEX] = { .type = NLA_U32 },
[NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING,
@@ -120,21 +113,20 @@ nla_put_failure:
static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb)
{
+ struct nlattr **attrbuf = genl_family_attrbuf(&nfc_genl_family);
struct nfc_dev *dev;
int rc;
u32 idx;
rc = nlmsg_parse(cb->nlh, GENL_HDRLEN + nfc_genl_family.hdrsize,
- nfc_genl_family.attrbuf,
- nfc_genl_family.maxattr,
- nfc_genl_policy);
+ attrbuf, nfc_genl_family.maxattr, nfc_genl_policy);
if (rc < 0)
return ERR_PTR(rc);
- if (!nfc_genl_family.attrbuf[NFC_ATTR_DEVICE_INDEX])
+ if (!attrbuf[NFC_ATTR_DEVICE_INDEX])
return ERR_PTR(-EINVAL);
- idx = nla_get_u32(nfc_genl_family.attrbuf[NFC_ATTR_DEVICE_INDEX]);
+ idx = nla_get_u32(attrbuf[NFC_ATTR_DEVICE_INDEX]);
dev = nfc_get_device(idx);
if (!dev)
@@ -1754,6 +1746,18 @@ static const struct genl_ops nfc_genl_ops[] = {
},
};
+static struct genl_family nfc_genl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = NFC_GENL_NAME,
+ .version = NFC_GENL_VERSION,
+ .maxattr = NFC_ATTR_MAX,
+ .module = THIS_MODULE,
+ .ops = nfc_genl_ops,
+ .n_ops = ARRAY_SIZE(nfc_genl_ops),
+ .mcgrps = nfc_genl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(nfc_genl_mcgrps),
+};
+
struct urelease_work {
struct work_struct w;
@@ -1839,9 +1843,7 @@ int __init nfc_genl_init(void)
{
int rc;
- rc = genl_register_family_with_ops_groups(&nfc_genl_family,
- nfc_genl_ops,
- nfc_genl_mcgrps);
+ rc = genl_register_family(&nfc_genl_family);
if (rc)
return rc;
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 4e03f64709bc..c82301ce3fff 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -62,9 +62,11 @@ struct ovs_frag_data {
struct vport *vport;
struct ovs_skb_cb cb;
__be16 inner_protocol;
- __u16 vlan_tci;
+ u16 network_offset; /* valid only for MPLS */
+ u16 vlan_tci;
__be16 vlan_proto;
unsigned int l2_len;
+ u8 mac_proto;
u8 l2_data[MAX_L2_LEN];
};
@@ -136,12 +138,12 @@ static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
static void invalidate_flow_key(struct sw_flow_key *key)
{
- key->eth.type = htons(0);
+ key->mac_proto |= SW_FLOW_KEY_INVALID;
}
static bool is_flow_key_valid(const struct sw_flow_key *key)
{
- return !!key->eth.type;
+ return !(key->mac_proto & SW_FLOW_KEY_INVALID);
}
static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
@@ -185,7 +187,8 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
- update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
+ if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET)
+ update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
skb->protocol = mpls->mpls_ethertype;
invalidate_flow_key(key);
@@ -195,7 +198,6 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
const __be16 ethertype)
{
- struct ethhdr *hdr;
int err;
err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
@@ -211,11 +213,15 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
skb_reset_mac_header(skb);
skb_set_network_header(skb, skb->mac_len);
- /* mpls_hdr() is used to locate the ethertype field correctly in the
- * presence of VLAN tags.
- */
- hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
- update_ethertype(skb, hdr, ethertype);
+ if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET) {
+ struct ethhdr *hdr;
+
+ /* mpls_hdr() is used to locate the ethertype field correctly in the
+ * presence of VLAN tags.
+ */
+ hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
+ update_ethertype(skb, hdr, ethertype);
+ }
if (eth_p_mpls(skb->protocol))
skb->protocol = ethertype;
@@ -311,6 +317,47 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
return 0;
}
+/* pop_eth does not support VLAN packets as this action is never called
+ * for them.
+ */
+static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ skb_pull_rcsum(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ /* safe right before invalidate_flow_key */
+ key->mac_proto = MAC_PROTO_NONE;
+ invalidate_flow_key(key);
+ return 0;
+}
+
+static int push_eth(struct sk_buff *skb, struct sw_flow_key *key,
+ const struct ovs_action_push_eth *ethh)
+{
+ struct ethhdr *hdr;
+
+ /* Add the new Ethernet header */
+ if (skb_cow_head(skb, ETH_HLEN) < 0)
+ return -ENOMEM;
+
+ skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ hdr = eth_hdr(skb);
+ ether_addr_copy(hdr->h_source, ethh->addresses.eth_src);
+ ether_addr_copy(hdr->h_dest, ethh->addresses.eth_dst);
+ hdr->h_proto = skb->protocol;
+
+ skb_postpush_rcsum(skb, hdr, ETH_HLEN);
+
+ /* safe right before invalidate_flow_key */
+ key->mac_proto = MAC_PROTO_ETHERNET;
+ invalidate_flow_key(key);
+ return 0;
+}
+
static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh,
__be32 addr, __be32 new_addr)
{
@@ -666,7 +713,13 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk
skb_postpush_rcsum(skb, skb->data, data->l2_len);
skb_reset_mac_header(skb);
- ovs_vport_send(vport, skb);
+ if (eth_p_mpls(skb->protocol)) {
+ skb->inner_network_header = skb->network_header;
+ skb_set_network_header(skb, data->network_offset);
+ skb_reset_mac_len(skb);
+ }
+
+ ovs_vport_send(vport, skb, data->mac_proto);
return 0;
}
@@ -684,7 +737,8 @@ static struct dst_ops ovs_dst_ops = {
/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is
* ovs_vport_output(), which is called once per fragmented packet.
*/
-static void prepare_frag(struct vport *vport, struct sk_buff *skb)
+static void prepare_frag(struct vport *vport, struct sk_buff *skb,
+ u16 orig_network_offset, u8 mac_proto)
{
unsigned int hlen = skb_network_offset(skb);
struct ovs_frag_data *data;
@@ -694,8 +748,10 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb)
data->vport = vport;
data->cb = *OVS_CB(skb);
data->inner_protocol = skb->inner_protocol;
+ data->network_offset = orig_network_offset;
data->vlan_tci = skb->vlan_tci;
data->vlan_proto = skb->vlan_proto;
+ data->mac_proto = mac_proto;
data->l2_len = hlen;
memcpy(&data->l2_data, skb->data, hlen);
@@ -704,18 +760,27 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb)
}
static void ovs_fragment(struct net *net, struct vport *vport,
- struct sk_buff *skb, u16 mru, __be16 ethertype)
+ struct sk_buff *skb, u16 mru,
+ struct sw_flow_key *key)
{
+ u16 orig_network_offset = 0;
+
+ if (eth_p_mpls(skb->protocol)) {
+ orig_network_offset = skb_network_offset(skb);
+ skb->network_header = skb->inner_network_header;
+ }
+
if (skb_network_offset(skb) > MAX_L2_LEN) {
OVS_NLERR(1, "L2 header too long to fragment");
goto err;
}
- if (ethertype == htons(ETH_P_IP)) {
+ if (key->eth.type == htons(ETH_P_IP)) {
struct dst_entry ovs_dst;
unsigned long orig_dst;
- prepare_frag(vport, skb);
+ prepare_frag(vport, skb, orig_network_offset,
+ ovs_key_mac_proto(key));
dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1,
DST_OBSOLETE_NONE, DST_NOCOUNT);
ovs_dst.dev = vport->dev;
@@ -726,16 +791,16 @@ static void ovs_fragment(struct net *net, struct vport *vport,
ip_do_fragment(net, skb->sk, skb, ovs_vport_output);
refdst_drop(orig_dst);
- } else if (ethertype == htons(ETH_P_IPV6)) {
+ } else if (key->eth.type == htons(ETH_P_IPV6)) {
const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
unsigned long orig_dst;
struct rt6_info ovs_rt;
- if (!v6ops) {
+ if (!v6ops)
goto err;
- }
- prepare_frag(vport, skb);
+ prepare_frag(vport, skb, orig_network_offset,
+ ovs_key_mac_proto(key));
memset(&ovs_rt, 0, sizeof(ovs_rt));
dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1,
DST_OBSOLETE_NONE, DST_NOCOUNT);
@@ -749,7 +814,7 @@ static void ovs_fragment(struct net *net, struct vport *vport,
refdst_drop(orig_dst);
} else {
WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.",
- ovs_vport_name(vport), ntohs(ethertype), mru,
+ ovs_vport_name(vport), ntohs(key->eth.type), mru,
vport->dev->mtu);
goto err;
}
@@ -769,26 +834,19 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
u32 cutlen = OVS_CB(skb)->cutlen;
if (unlikely(cutlen > 0)) {
- if (skb->len - cutlen > ETH_HLEN)
+ if (skb->len - cutlen > ovs_mac_header_len(key))
pskb_trim(skb, skb->len - cutlen);
else
- pskb_trim(skb, ETH_HLEN);
+ pskb_trim(skb, ovs_mac_header_len(key));
}
- if (likely(!mru || (skb->len <= mru + ETH_HLEN))) {
- ovs_vport_send(vport, skb);
+ if (likely(!mru ||
+ (skb->len <= mru + vport->dev->hard_header_len))) {
+ ovs_vport_send(vport, skb, ovs_key_mac_proto(key));
} else if (mru <= vport->dev->mtu) {
struct net *net = read_pnet(&dp->net);
- __be16 ethertype = key->eth.type;
- if (!is_flow_key_valid(key)) {
- if (eth_p_mpls(skb->protocol))
- ethertype = skb->inner_protocol;
- else
- ethertype = vlan_get_protocol(skb);
- }
-
- ovs_fragment(net, vport, skb, mru, ethertype);
+ ovs_fragment(net, vport, skb, mru, key);
} else {
kfree_skb(skb);
}
@@ -1015,6 +1073,8 @@ static int execute_masked_set_action(struct sk_buff *skb,
case OVS_KEY_ATTR_CT_ZONE:
case OVS_KEY_ATTR_CT_MARK:
case OVS_KEY_ATTR_CT_LABELS:
+ case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4:
+ case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6:
err = -EINVAL;
break;
}
@@ -1082,12 +1142,6 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key,
const struct nlattr *attr, int len)
{
- /* Every output action needs a separate clone of 'skb', but the common
- * case is just a single output action, so that doing a clone and
- * then freeing the original skbuff is wasteful. So the following code
- * is slightly obscure just to avoid that.
- */
- int prev_port = -1;
const struct nlattr *a;
int rem;
@@ -1095,20 +1149,28 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
a = nla_next(a, &rem)) {
int err = 0;
- if (unlikely(prev_port != -1)) {
- struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
-
- if (out_skb)
- do_output(dp, out_skb, prev_port, key);
+ switch (nla_type(a)) {
+ case OVS_ACTION_ATTR_OUTPUT: {
+ int port = nla_get_u32(a);
+ struct sk_buff *clone;
+
+ /* Every output action needs a separate clone
+ * of 'skb', In case the output action is the
+ * last action, cloning can be avoided.
+ */
+ if (nla_is_last(a, rem)) {
+ do_output(dp, skb, port, key);
+ /* 'skb' has been used for output.
+ */
+ return 0;
+ }
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (clone)
+ do_output(dp, clone, port, key);
OVS_CB(skb)->cutlen = 0;
- prev_port = -1;
- }
-
- switch (nla_type(a)) {
- case OVS_ACTION_ATTR_OUTPUT:
- prev_port = nla_get_u32(a);
break;
+ }
case OVS_ACTION_ATTR_TRUNC: {
struct ovs_action_trunc *trunc = nla_data(a);
@@ -1182,6 +1244,14 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
if (err)
return err == -EINPROGRESS ? 0 : err;
break;
+
+ case OVS_ACTION_ATTR_PUSH_ETH:
+ err = push_eth(skb, key, nla_data(a));
+ break;
+
+ case OVS_ACTION_ATTR_POP_ETH:
+ err = pop_eth(skb, key);
+ break;
}
if (unlikely(err)) {
@@ -1190,11 +1260,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
}
}
- if (prev_port != -1)
- do_output(dp, skb, prev_port, key);
- else
- consume_skb(skb);
-
+ consume_skb(skb);
return 0;
}
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index fecefa2dc94e..7b2c2fce408a 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -65,6 +65,7 @@ struct ovs_conntrack_info {
struct nf_conn *ct;
u8 commit : 1;
u8 nat : 3; /* enum ovs_ct_nat */
+ u8 force : 1;
u16 family;
struct md_mark mark;
struct md_labels labels;
@@ -73,6 +74,8 @@ struct ovs_conntrack_info {
#endif
};
+static bool labels_nonzero(const struct ovs_key_ct_labels *labels);
+
static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
static u16 key_to_nfproto(const struct sw_flow_key *key)
@@ -129,21 +132,33 @@ static u32 ovs_ct_get_mark(const struct nf_conn *ct)
#endif
}
+/* Guard against conntrack labels max size shrinking below 128 bits. */
+#if NF_CT_LABELS_MAX_SIZE < 16
+#error NF_CT_LABELS_MAX_SIZE must be at least 16 bytes
+#endif
+
static void ovs_ct_get_labels(const struct nf_conn *ct,
struct ovs_key_ct_labels *labels)
{
struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL;
- if (cl) {
- size_t len = sizeof(cl->bits);
+ if (cl)
+ memcpy(labels, cl->bits, OVS_CT_LABELS_LEN);
+ else
+ memset(labels, 0, OVS_CT_LABELS_LEN);
+}
- if (len > OVS_CT_LABELS_LEN)
- len = OVS_CT_LABELS_LEN;
- else if (len < OVS_CT_LABELS_LEN)
- memset(labels, 0, OVS_CT_LABELS_LEN);
- memcpy(labels, cl->bits, len);
+static void __ovs_ct_update_key_orig_tp(struct sw_flow_key *key,
+ const struct nf_conntrack_tuple *orig,
+ u8 icmp_proto)
+{
+ key->ct_orig_proto = orig->dst.protonum;
+ if (orig->dst.protonum == icmp_proto) {
+ key->ct.orig_tp.src = htons(orig->dst.u.icmp.type);
+ key->ct.orig_tp.dst = htons(orig->dst.u.icmp.code);
} else {
- memset(labels, 0, OVS_CT_LABELS_LEN);
+ key->ct.orig_tp.src = orig->src.u.all;
+ key->ct.orig_tp.dst = orig->dst.u.all;
}
}
@@ -151,13 +166,42 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
const struct nf_conntrack_zone *zone,
const struct nf_conn *ct)
{
- key->ct.state = state;
- key->ct.zone = zone->id;
+ key->ct_state = state;
+ key->ct_zone = zone->id;
key->ct.mark = ovs_ct_get_mark(ct);
ovs_ct_get_labels(ct, &key->ct.labels);
+
+ if (ct) {
+ const struct nf_conntrack_tuple *orig;
+
+ /* Use the master if we have one. */
+ if (ct->master)
+ ct = ct->master;
+ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+
+ /* IP version must match with the master connection. */
+ if (key->eth.type == htons(ETH_P_IP) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4) {
+ key->ipv4.ct_orig.src = orig->src.u3.ip;
+ key->ipv4.ct_orig.dst = orig->dst.u3.ip;
+ __ovs_ct_update_key_orig_tp(key, orig, IPPROTO_ICMP);
+ return;
+ } else if (key->eth.type == htons(ETH_P_IPV6) &&
+ !sw_flow_key_is_nd(key) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV6) {
+ key->ipv6.ct_orig.src = orig->src.u3.in6;
+ key->ipv6.ct_orig.dst = orig->dst.u3.in6;
+ __ovs_ct_update_key_orig_tp(key, orig, NEXTHDR_ICMP);
+ return;
+ }
+ }
+ /* Clear 'ct_orig_proto' to mark the non-existence of conntrack
+ * original direction key fields.
+ */
+ key->ct_orig_proto = 0;
}
-/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
+/* Update 'key' based on skb->_nfct. If 'post_ct' is true, then OVS has
* previously sent the packet to conntrack via the ct action. If
* 'keep_nat_flags' is true, the existing NAT flags retained, else they are
* initialized from the connection status.
@@ -184,7 +228,7 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
if (ct->master)
state |= OVS_CS_F_RELATED;
if (keep_nat_flags) {
- state |= key->ct.state & OVS_CS_F_NAT_MASK;
+ state |= key->ct_state & OVS_CS_F_NAT_MASK;
} else {
if (ct->status & IPS_SRC_NAT)
state |= OVS_CS_F_SRC_NAT;
@@ -208,44 +252,69 @@ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
ovs_ct_update_key(skb, NULL, key, false, false);
}
-int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
+#define IN6_ADDR_INITIALIZER(ADDR) \
+ { (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \
+ (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] }
+
+int ovs_ct_put_key(const struct sw_flow_key *swkey,
+ const struct sw_flow_key *output, struct sk_buff *skb)
{
- if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state))
+ if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, output->ct_state))
return -EMSGSIZE;
if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
- nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone))
+ nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, output->ct_zone))
return -EMSGSIZE;
if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
- nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark))
+ nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, output->ct.mark))
return -EMSGSIZE;
if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
- nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(key->ct.labels),
- &key->ct.labels))
+ nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(output->ct.labels),
+ &output->ct.labels))
return -EMSGSIZE;
+ if (swkey->ct_orig_proto) {
+ if (swkey->eth.type == htons(ETH_P_IP)) {
+ struct ovs_key_ct_tuple_ipv4 orig = {
+ output->ipv4.ct_orig.src,
+ output->ipv4.ct_orig.dst,
+ output->ct.orig_tp.src,
+ output->ct.orig_tp.dst,
+ output->ct_orig_proto,
+ };
+ if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,
+ sizeof(orig), &orig))
+ return -EMSGSIZE;
+ } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+ struct ovs_key_ct_tuple_ipv6 orig = {
+ IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src),
+ IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst),
+ output->ct.orig_tp.src,
+ output->ct.orig_tp.dst,
+ output->ct_orig_proto,
+ };
+ if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,
+ sizeof(orig), &orig))
+ return -EMSGSIZE;
+ }
+ }
+
return 0;
}
-static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
+static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key,
u32 ct_mark, u32 mask)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct;
u32 new_mark;
- /* The connection could be invalid, in which case set_mark is no-op. */
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct)
- return 0;
-
new_mark = ct_mark | (ct->mark & ~(mask));
if (ct->mark != new_mark) {
ct->mark = new_mark;
- nf_conntrack_event_cache(IPCT_MARK, ct);
+ if (nf_ct_is_confirmed(ct))
+ nf_conntrack_event_cache(IPCT_MARK, ct);
key->ct.mark = new_mark;
}
@@ -255,34 +324,83 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
#endif
}
-static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key,
- const struct ovs_key_ct_labels *labels,
- const struct ovs_key_ct_labels *mask)
+static struct nf_conn_labels *ovs_ct_get_conn_labels(struct nf_conn *ct)
{
- enum ip_conntrack_info ctinfo;
struct nf_conn_labels *cl;
- struct nf_conn *ct;
- int err;
-
- /* The connection could be invalid, in which case set_label is no-op.*/
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct)
- return 0;
cl = nf_ct_labels_find(ct);
if (!cl) {
nf_ct_labels_ext_add(ct);
cl = nf_ct_labels_find(ct);
}
- if (!cl || sizeof(cl->bits) < OVS_CT_LABELS_LEN)
+
+ return cl;
+}
+
+/* Initialize labels for a new, yet to be committed conntrack entry. Note that
+ * since the new connection is not yet confirmed, and thus no-one else has
+ * access to it's labels, we simply write them over.
+ */
+static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key,
+ const struct ovs_key_ct_labels *labels,
+ const struct ovs_key_ct_labels *mask)
+{
+ struct nf_conn_labels *cl, *master_cl;
+ bool have_mask = labels_nonzero(mask);
+
+ /* Inherit master's labels to the related connection? */
+ master_cl = ct->master ? nf_ct_labels_find(ct->master) : NULL;
+
+ if (!master_cl && !have_mask)
+ return 0; /* Nothing to do. */
+
+ cl = ovs_ct_get_conn_labels(ct);
+ if (!cl)
return -ENOSPC;
- err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask,
- OVS_CT_LABELS_LEN / sizeof(u32));
+ /* Inherit the master's labels, if any. */
+ if (master_cl)
+ *cl = *master_cl;
+
+ if (have_mask) {
+ u32 *dst = (u32 *)cl->bits;
+ int i;
+
+ for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
+ dst[i] = (dst[i] & ~mask->ct_labels_32[i]) |
+ (labels->ct_labels_32[i]
+ & mask->ct_labels_32[i]);
+ }
+
+ /* Labels are included in the IPCTNL_MSG_CT_NEW event only if the
+ * IPCT_LABEL bit it set in the event cache.
+ */
+ nf_conntrack_event_cache(IPCT_LABEL, ct);
+
+ memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
+
+ return 0;
+}
+
+static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key,
+ const struct ovs_key_ct_labels *labels,
+ const struct ovs_key_ct_labels *mask)
+{
+ struct nf_conn_labels *cl;
+ int err;
+
+ cl = ovs_ct_get_conn_labels(ct);
+ if (!cl)
+ return -ENOSPC;
+
+ err = nf_connlabels_replace(ct, labels->ct_labels_32,
+ mask->ct_labels_32,
+ OVS_CT_LABELS_LEN_32);
if (err)
return err;
- ovs_ct_get_labels(ct, &key->ct.labels);
+ memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
+
return 0;
}
@@ -367,7 +485,6 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
} else if (key->eth.type == htons(ETH_P_IPV6)) {
enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
- skb_orphan(skb);
memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
err = nf_ct_frag6_gather(net, skb, user);
if (err) {
@@ -421,16 +538,16 @@ ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
/* Find an existing connection which this packet belongs to without
* re-attributing statistics or modifying the connection state. This allows an
- * skb->nfct lost due to an upcall to be recovered during actions execution.
+ * skb->_nfct lost due to an upcall to be recovered during actions execution.
*
* Must be called with rcu_read_lock.
*
- * On success, populates skb->nfct and skb->nfctinfo, and returns the
- * connection. Returns NULL if there is no existing entry.
+ * On success, populates skb->_nfct and returns the connection. Returns NULL
+ * if there is no existing entry.
*/
static struct nf_conn *
ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
- u8 l3num, struct sk_buff *skb)
+ u8 l3num, struct sk_buff *skb, bool natted)
{
struct nf_conntrack_l3proto *l3proto;
struct nf_conntrack_l4proto *l4proto;
@@ -453,6 +570,17 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
return NULL;
}
+ /* Must invert the tuple if skb has been transformed by NAT. */
+ if (natted) {
+ struct nf_conntrack_tuple inverse;
+
+ if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) {
+ pr_debug("ovs_ct_find_existing: Inversion failed!\n");
+ return NULL;
+ }
+ tuple = inverse;
+ }
+
/* look for tuple match */
h = nf_conntrack_find_get(net, zone, &tuple);
if (!h)
@@ -460,12 +588,18 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
ct = nf_ct_tuplehash_to_ctrack(h);
- skb->nfct = &ct->ct_general;
- skb->nfctinfo = ovs_ct_get_info(h);
+ /* Inverted packet tuple matches the reverse direction conntrack tuple,
+ * select the other tuplehash to get the right 'ctinfo' bits for this
+ * packet.
+ */
+ if (natted)
+ h = &ct->tuplehash[!h->tuple.dst.dir];
+
+ nf_ct_set(skb, ct, ovs_ct_get_info(h));
return ct;
}
-/* Determine whether skb->nfct is equal to the result of conntrack lookup. */
+/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
static bool skb_nfct_cached(struct net *net,
const struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
@@ -476,14 +610,19 @@ static bool skb_nfct_cached(struct net *net,
ct = nf_ct_get(skb, &ctinfo);
/* If no ct, check if we have evidence that an existing conntrack entry
- * might be found for this skb. This happens when we lose a skb->nfct
+ * might be found for this skb. This happens when we lose a skb->_nfct
* due to an upcall. If the connection was not confirmed, it is not
* cached and needs to be run through conntrack again.
*/
- if (!ct && key->ct.state & OVS_CS_F_TRACKED &&
- !(key->ct.state & OVS_CS_F_INVALID) &&
- key->ct.zone == info->zone.id)
- ct = ovs_ct_find_existing(net, &info->zone, info->family, skb);
+ if (!ct && key->ct_state & OVS_CS_F_TRACKED &&
+ !(key->ct_state & OVS_CS_F_INVALID) &&
+ key->ct_zone == info->zone.id) {
+ ct = ovs_ct_find_existing(net, &info->zone, info->family, skb,
+ !!(key->ct_state
+ & OVS_CS_F_NAT_MASK));
+ if (ct)
+ nf_ct_get(skb, &ctinfo);
+ }
if (!ct)
return false;
if (!net_eq(net, read_pnet(&ct->ct_net)))
@@ -497,6 +636,18 @@ static bool skb_nfct_cached(struct net *net,
if (help && rcu_access_pointer(help->helper) != info->helper)
return false;
}
+ /* Force conntrack entry direction to the current packet? */
+ if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
+ /* Delete the conntrack entry if confirmed, else just release
+ * the reference.
+ */
+ if (nf_ct_is_confirmed(ct))
+ nf_ct_delete(ct, 0, 0);
+
+ nf_conntrack_put(&ct->ct_general);
+ nf_ct_set(skb, NULL, 0);
+ return false;
+ }
return true;
}
@@ -514,7 +665,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
int hooknum, nh_off, err = NF_ACCEPT;
nh_off = skb_network_offset(skb);
- skb_pull(skb, nh_off);
+ skb_pull_rcsum(skb, nh_off);
/* See HOOK2MANIP(). */
if (maniptype == NF_NAT_MANIP_SRC)
@@ -579,6 +730,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
err = nf_nat_packet(ct, ctinfo, hooknum, skb);
push:
skb_push(skb, nh_off);
+ skb_postpush_rcsum(skb, skb->data, nh_off);
return err;
}
@@ -590,7 +742,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key,
if (maniptype == NF_NAT_MANIP_SRC) {
__be16 src;
- key->ct.state |= OVS_CS_F_SRC_NAT;
+ key->ct_state |= OVS_CS_F_SRC_NAT;
if (key->eth.type == htons(ETH_P_IP))
key->ipv4.addr.src = ip_hdr(skb)->saddr;
else if (key->eth.type == htons(ETH_P_IPV6))
@@ -612,7 +764,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key,
} else {
__be16 dst;
- key->ct.state |= OVS_CS_F_DST_NAT;
+ key->ct_state |= OVS_CS_F_DST_NAT;
if (key->eth.type == htons(ETH_P_IP))
key->ipv4.addr.dst = ip_hdr(skb)->daddr;
else if (key->eth.type == htons(ETH_P_IPV6))
@@ -699,7 +851,7 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
* not done already. Update key with new CT state after passing the packet
* through conntrack.
- * Note that if the packet is deemed invalid by conntrack, skb->nfct will be
+ * Note that if the packet is deemed invalid by conntrack, skb->_nfct will be
* set to NULL and 0 will be returned.
*/
static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
@@ -721,19 +873,14 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
/* Associate skb with specified zone. */
if (tmpl) {
- if (skb->nfct)
- nf_conntrack_put(skb->nfct);
+ if (skb_nfct(skb))
+ nf_conntrack_put(skb_nfct(skb));
nf_conntrack_get(&tmpl->ct_general);
- skb->nfct = &tmpl->ct_general;
- skb->nfctinfo = IP_CT_NEW;
+ nf_ct_set(skb, tmpl, IP_CT_NEW);
}
- /* Repeat if requested, see nf_iterate(). */
- do {
- err = nf_conntrack_in(net, info->family,
- NF_INET_PRE_ROUTING, skb);
- } while (err == NF_REPEAT);
-
+ err = nf_conntrack_in(net, info->family,
+ NF_INET_PRE_ROUTING, skb);
if (err != NF_ACCEPT)
return -ENOENT;
@@ -741,7 +888,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
* NAT after the nf_conntrack_in() call. We can actually clear
* the whole state, as it will be re-initialized below.
*/
- key->ct.state = 0;
+ key->ct_state = 0;
/* Update the key, but keep the NAT flags. */
ovs_ct_update_key(skb, info, key, true, true);
@@ -757,9 +904,9 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
*
* NAT will be done only if the CT action has NAT, and only
* once per packet (per zone), as guarded by the NAT bits in
- * the key->ct.state.
+ * the key->ct_state.
*/
- if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) &&
+ if (info->nat && !(key->ct_state & OVS_CS_F_NAT_MASK) &&
(nf_ct_is_confirmed(ct) || info->commit) &&
ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
return -EINVAL;
@@ -823,7 +970,7 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
if (err)
return err;
- ct = (struct nf_conn *)skb->nfct;
+ ct = (struct nf_conn *)skb_nfct(skb);
if (ct)
nf_ct_deliver_cached_events(ct);
}
@@ -835,8 +982,8 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
{
size_t i;
- for (i = 0; i < sizeof(*labels); i++)
- if (labels->ct_labels[i])
+ for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
+ if (labels->ct_labels_32[i])
return true;
return false;
@@ -847,24 +994,36 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
struct sk_buff *skb)
{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
int err;
err = __ovs_ct_lookup(net, key, info, skb);
if (err)
return err;
+ /* The connection could be invalid, in which case this is a no-op.*/
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return 0;
+
/* Apply changes before confirming the connection so that the initial
* conntrack NEW netlink event carries the values given in the CT
* action.
*/
if (info->mark.mask) {
- err = ovs_ct_set_mark(skb, key, info->mark.value,
+ err = ovs_ct_set_mark(ct, key, info->mark.value,
info->mark.mask);
if (err)
return err;
}
- if (labels_nonzero(&info->labels.mask)) {
- err = ovs_ct_set_labels(skb, key, &info->labels.value,
+ if (!nf_ct_is_confirmed(ct)) {
+ err = ovs_ct_init_labels(ct, key, &info->labels.value,
+ &info->labels.mask);
+ if (err)
+ return err;
+ } else if (labels_nonzero(&info->labels.mask)) {
+ err = ovs_ct_set_labels(ct, key, &info->labels.value,
&info->labels.mask);
if (err)
return err;
@@ -890,7 +1049,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
/* The conntrack module expects to be working at L3. */
nh_ofs = skb_network_offset(skb);
- skb_pull(skb, nh_ofs);
+ skb_pull_rcsum(skb, nh_ofs);
if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
err = handle_fragments(net, key, info->zone.id, skb);
@@ -904,6 +1063,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
err = ovs_ct_lookup(net, key, info, skb);
skb_push(skb, nh_ofs);
+ skb_postpush_rcsum(skb, skb->data, nh_ofs);
if (err)
kfree_skb(skb);
return err;
@@ -1065,6 +1225,7 @@ static int parse_nat(const struct nlattr *attr,
static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
[OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 },
+ [OVS_CT_ATTR_FORCE_COMMIT] = { .minlen = 0, .maxlen = 0 },
[OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16),
.maxlen = sizeof(u16) },
[OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark),
@@ -1104,6 +1265,9 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
}
switch (type) {
+ case OVS_CT_ATTR_FORCE_COMMIT:
+ info->force = true;
+ /* fall through. */
case OVS_CT_ATTR_COMMIT:
info->commit = true;
break;
@@ -1330,7 +1494,9 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
if (!start)
return -EMSGSIZE;
- if (ct_info->commit && nla_put_flag(skb, OVS_CT_ATTR_COMMIT))
+ if (ct_info->commit && nla_put_flag(skb, ct_info->force
+ ? OVS_CT_ATTR_FORCE_COMMIT
+ : OVS_CT_ATTR_COMMIT))
return -EMSGSIZE;
if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id))
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 8f6230bd6183..bc7efd1867ab 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -32,7 +32,8 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
const struct ovs_conntrack_info *);
void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);
-int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb);
+int ovs_ct_put_key(const struct sw_flow_key *swkey,
+ const struct sw_flow_key *output, struct sk_buff *skb);
void ovs_ct_free_action(const struct nlattr *a);
#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
@@ -75,13 +76,18 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb,
static inline void ovs_ct_fill_key(const struct sk_buff *skb,
struct sw_flow_key *key)
{
- key->ct.state = 0;
- key->ct.zone = 0;
+ key->ct_state = 0;
+ key->ct_zone = 0;
key->ct.mark = 0;
memset(&key->ct.labels, 0, sizeof(key->ct.labels));
+ /* Clear 'ct_orig_proto' to mark the non-existence of original
+ * direction key fields.
+ */
+ key->ct_orig_proto = 0;
}
-static inline int ovs_ct_put_key(const struct sw_flow_key *key,
+static inline int ovs_ct_put_key(const struct sw_flow_key *swkey,
+ const struct sw_flow_key *output,
struct sk_buff *skb)
{
return 0;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 4d67ea856067..9c62b6325f7a 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -58,8 +58,7 @@
#include "vport-internal_dev.h"
#include "vport-netdev.h"
-int ovs_net_id __read_mostly;
-EXPORT_SYMBOL_GPL(ovs_net_id);
+unsigned int ovs_net_id __read_mostly;
static struct genl_family dp_packet_genl_family;
static struct genl_family dp_flow_genl_family;
@@ -131,7 +130,6 @@ int lockdep_ovsl_is_held(void)
else
return 1;
}
-EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held);
#endif
static struct vport *new_vport(const struct vport_parms *);
@@ -562,7 +560,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
struct sw_flow *flow;
struct sw_flow_actions *sf_acts;
struct datapath *dp;
- struct ethhdr *eth;
struct vport *input_vport;
u16 mru = 0;
int len;
@@ -583,17 +580,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len);
- skb_reset_mac_header(packet);
- eth = eth_hdr(packet);
-
- /* Normally, setting the skb 'protocol' field would be handled by a
- * call to eth_type_trans(), but it assumes there's a sending
- * device, which we may not have. */
- if (eth_proto_is_802_3(eth->h_proto))
- packet->protocol = eth->h_proto;
- else
- packet->protocol = htons(ETH_P_802_2);
-
/* Set packet's mru */
if (a[OVS_PACKET_ATTR_MRU]) {
mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]);
@@ -672,8 +658,7 @@ static const struct genl_ops dp_packet_genl_ops[] = {
}
};
-static struct genl_family dp_packet_genl_family = {
- .id = GENL_ID_GENERATE,
+static struct genl_family dp_packet_genl_family __ro_after_init = {
.hdrsize = sizeof(struct ovs_header),
.name = OVS_PACKET_FAMILY,
.version = OVS_PACKET_VERSION,
@@ -682,6 +667,7 @@ static struct genl_family dp_packet_genl_family = {
.parallel_ops = true,
.ops = dp_packet_genl_ops,
.n_ops = ARRAY_SIZE(dp_packet_genl_ops),
+ .module = THIS_MODULE,
};
static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats,
@@ -1437,8 +1423,7 @@ static const struct genl_ops dp_flow_genl_ops[] = {
},
};
-static struct genl_family dp_flow_genl_family = {
- .id = GENL_ID_GENERATE,
+static struct genl_family dp_flow_genl_family __ro_after_init = {
.hdrsize = sizeof(struct ovs_header),
.name = OVS_FLOW_FAMILY,
.version = OVS_FLOW_VERSION,
@@ -1449,6 +1434,7 @@ static struct genl_family dp_flow_genl_family = {
.n_ops = ARRAY_SIZE(dp_flow_genl_ops),
.mcgrps = &ovs_dp_flow_multicast_group,
.n_mcgrps = 1,
+ .module = THIS_MODULE,
};
static size_t ovs_dp_cmd_msg_size(void)
@@ -1823,8 +1809,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = {
},
};
-static struct genl_family dp_datapath_genl_family = {
- .id = GENL_ID_GENERATE,
+static struct genl_family dp_datapath_genl_family __ro_after_init = {
.hdrsize = sizeof(struct ovs_header),
.name = OVS_DATAPATH_FAMILY,
.version = OVS_DATAPATH_VERSION,
@@ -1835,6 +1820,7 @@ static struct genl_family dp_datapath_genl_family = {
.n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
.mcgrps = &ovs_dp_datapath_multicast_group,
.n_mcgrps = 1,
+ .module = THIS_MODULE,
};
/* Called with ovs_mutex or RCU read lock. */
@@ -2245,8 +2231,7 @@ static const struct genl_ops dp_vport_genl_ops[] = {
},
};
-struct genl_family dp_vport_genl_family = {
- .id = GENL_ID_GENERATE,
+struct genl_family dp_vport_genl_family __ro_after_init = {
.hdrsize = sizeof(struct ovs_header),
.name = OVS_VPORT_FAMILY,
.version = OVS_VPORT_VERSION,
@@ -2257,6 +2242,7 @@ struct genl_family dp_vport_genl_family = {
.n_ops = ARRAY_SIZE(dp_vport_genl_ops),
.mcgrps = &ovs_dp_vport_multicast_group,
.n_mcgrps = 1,
+ .module = THIS_MODULE,
};
static struct genl_family * const dp_genl_families[] = {
@@ -2274,7 +2260,7 @@ static void dp_unregister_genl(int n_families)
genl_unregister_family(dp_genl_families[i]);
}
-static int dp_register_genl(void)
+static int __init dp_register_genl(void)
{
int err;
int i;
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index ab85c1cae255..1c6e9377436d 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -144,7 +144,7 @@ struct ovs_net {
bool xt_label;
};
-extern int ovs_net_id;
+extern unsigned int ovs_net_id;
void ovs_lock(void);
void ovs_unlock(void);
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 22087062bd10..3f76cb765e5b 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -312,7 +312,8 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
* Returns 0 if it encounters a non-vlan or incomplete packet.
* Returns 1 after successfully parsing vlan tag.
*/
-static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh)
+static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh,
+ bool untag_vlan)
{
struct vlan_head *vh = (struct vlan_head *)skb->data;
@@ -330,31 +331,47 @@ static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh)
key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT);
key_vh->tpid = vh->tpid;
- __skb_pull(skb, sizeof(struct vlan_head));
+ if (unlikely(untag_vlan)) {
+ int offset = skb->data - skb_mac_header(skb);
+ u16 tci;
+ int err;
+
+ __skb_push(skb, offset);
+ err = __skb_vlan_pop(skb, &tci);
+ __skb_pull(skb, offset);
+ if (err)
+ return err;
+ __vlan_hwaccel_put_tag(skb, key_vh->tpid, tci);
+ } else {
+ __skb_pull(skb, sizeof(struct vlan_head));
+ }
return 1;
}
-static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
+static void clear_vlan(struct sw_flow_key *key)
{
- int res;
-
key->eth.vlan.tci = 0;
key->eth.vlan.tpid = 0;
key->eth.cvlan.tci = 0;
key->eth.cvlan.tpid = 0;
+}
+
+static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ int res;
if (skb_vlan_tag_present(skb)) {
key->eth.vlan.tci = htons(skb->vlan_tci);
key->eth.vlan.tpid = skb->vlan_proto;
} else {
/* Parse outer vlan tag in the non-accelerated case. */
- res = parse_vlan_tag(skb, &key->eth.vlan);
+ res = parse_vlan_tag(skb, &key->eth.vlan, true);
if (res <= 0)
return res;
}
/* Parse inner vlan tag. */
- res = parse_vlan_tag(skb, &key->eth.cvlan);
+ res = parse_vlan_tag(skb, &key->eth.cvlan, false);
if (res <= 0)
return res;
@@ -483,17 +500,20 @@ invalid:
*
* Returns 0 if successful, otherwise a negative errno value.
*
- * Initializes @skb header pointers as follows:
+ * Initializes @skb header fields as follows:
*
- * - skb->mac_header: the Ethernet header.
+ * - skb->mac_header: the L2 header.
*
- * - skb->network_header: just past the Ethernet header, or just past the
- * VLAN header, to the first byte of the Ethernet payload.
+ * - skb->network_header: just past the L2 header, or just past the
+ * VLAN header, to the first byte of the L2 payload.
*
* - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6
* on output, then just past the IP header, if one is present and
* of a correct length, otherwise the same as skb->network_header.
* For other key->eth.type values it is left untouched.
+ *
+ * - skb->protocol: the type of the data starting at skb->network_header.
+ * Equals to key->eth.type.
*/
static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
{
@@ -505,28 +525,35 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
skb_reset_mac_header(skb);
- /* Link layer. We are guaranteed to have at least the 14 byte Ethernet
- * header in the linear data area.
- */
- eth = eth_hdr(skb);
- ether_addr_copy(key->eth.src, eth->h_source);
- ether_addr_copy(key->eth.dst, eth->h_dest);
+ /* Link layer. */
+ clear_vlan(key);
+ if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) {
+ if (unlikely(eth_type_vlan(skb->protocol)))
+ return -EINVAL;
- __skb_pull(skb, 2 * ETH_ALEN);
- /* We are going to push all headers that we pull, so no need to
- * update skb->csum here.
- */
+ skb_reset_network_header(skb);
+ } else {
+ eth = eth_hdr(skb);
+ ether_addr_copy(key->eth.src, eth->h_source);
+ ether_addr_copy(key->eth.dst, eth->h_dest);
- if (unlikely(parse_vlan(skb, key)))
- return -ENOMEM;
+ __skb_pull(skb, 2 * ETH_ALEN);
+ /* We are going to push all headers that we pull, so no need to
+ * update skb->csum here.
+ */
- key->eth.type = parse_ethertype(skb);
- if (unlikely(key->eth.type == htons(0)))
- return -ENOMEM;
+ if (unlikely(parse_vlan(skb, key)))
+ return -ENOMEM;
+
+ skb->protocol = parse_ethertype(skb);
+ if (unlikely(skb->protocol == htons(0)))
+ return -ENOMEM;
- skb_reset_network_header(skb);
+ skb_reset_network_header(skb);
+ __skb_push(skb, skb->data - skb_mac_header(skb));
+ }
skb_reset_mac_len(skb);
- __skb_push(skb, skb->data - skb_mac_header(skb));
+ key->eth.type = skb->protocol;
/* Network layer. */
if (key->eth.type == htons(ETH_P_IP)) {
@@ -718,12 +745,34 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
{
- return key_extract(skb, key);
+ int res;
+
+ res = key_extract(skb, key);
+ if (!res)
+ key->mac_proto &= ~SW_FLOW_KEY_INVALID;
+
+ return res;
+}
+
+static int key_extract_mac_proto(struct sk_buff *skb)
+{
+ switch (skb->dev->type) {
+ case ARPHRD_ETHER:
+ return MAC_PROTO_ETHERNET;
+ case ARPHRD_NONE:
+ if (skb->protocol == htons(ETH_P_TEB))
+ return MAC_PROTO_ETHERNET;
+ return MAC_PROTO_NONE;
+ }
+ WARN_ON_ONCE(1);
+ return -EINVAL;
}
int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
struct sk_buff *skb, struct sw_flow_key *key)
{
+ int res, err;
+
/* Extract metadata from packet. */
if (tun_info) {
key->tun_proto = ip_tunnel_info_af(tun_info);
@@ -749,23 +798,61 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
key->phy.priority = skb->priority;
key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
key->phy.skb_mark = skb->mark;
- ovs_ct_fill_key(skb, key);
key->ovs_flow_hash = 0;
+ res = key_extract_mac_proto(skb);
+ if (res < 0)
+ return res;
+ key->mac_proto = res;
key->recirc_id = 0;
- return key_extract(skb, key);
+ err = key_extract(skb, key);
+ if (!err)
+ ovs_ct_fill_key(skb, key); /* Must be after key_extract(). */
+ return err;
}
int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
struct sk_buff *skb,
struct sw_flow_key *key, bool log)
{
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+ u64 attrs = 0;
int err;
+ err = parse_flow_nlattrs(attr, a, &attrs, log);
+ if (err)
+ return -EINVAL;
+
/* Extract metadata from netlink attributes. */
- err = ovs_nla_get_flow_metadata(net, attr, key, log);
+ err = ovs_nla_get_flow_metadata(net, a, attrs, key, log);
if (err)
return err;
- return key_extract(skb, key);
+ /* key_extract assumes that skb->protocol is set-up for
+ * layer 3 packets which is the case for other callers,
+ * in particular packets received from the network stack.
+ * Here the correct value can be set from the metadata
+ * extracted above.
+ * For L2 packet key eth type would be zero. skb protocol
+ * would be set to correct value later during key-extact.
+ */
+
+ skb->protocol = key->eth.type;
+ err = key_extract(skb, key);
+ if (err)
+ return err;
+
+ /* Check that we have conntrack original direction tuple metadata only
+ * for packets for which it makes sense. Otherwise the key may be
+ * corrupted due to overlapping key fields.
+ */
+ if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) &&
+ key->eth.type != htons(ETH_P_IP))
+ return -EINVAL;
+ if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) &&
+ (key->eth.type != htons(ETH_P_IPV6) ||
+ sw_flow_key_is_nd(key)))
+ return -EINVAL;
+
+ return 0;
}
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index ae783f5c6695..a9bc1c875965 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2017 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -37,6 +37,12 @@
struct sk_buff;
+enum sw_flow_mac_proto {
+ MAC_PROTO_NONE = 0,
+ MAC_PROTO_ETHERNET,
+};
+#define SW_FLOW_KEY_INVALID 0x80
+
/* Store options at the end of the array if they are less than the
* maximum size. This allows us to get the benefits of variable length
* matching for small options.
@@ -68,6 +74,7 @@ struct sw_flow_key {
u32 skb_mark; /* SKB mark. */
u16 in_port; /* Input switch port (or DP_MAX_PORTS). */
} __packed phy; /* Safe when right after 'tun_key'. */
+ u8 mac_proto; /* MAC layer protocol (e.g. Ethernet). */
u8 tun_proto; /* Protocol of encapsulating tunnel. */
u32 ovs_flow_hash; /* Datapath computed hash value. */
u32 recirc_id; /* Recirculation ID. */
@@ -78,6 +85,11 @@ struct sw_flow_key {
struct vlan_head cvlan;
__be16 type; /* Ethernet frame type. */
} eth;
+ /* Filling a hole of two bytes. */
+ u8 ct_state;
+ u8 ct_orig_proto; /* CT original direction tuple IP
+ * protocol.
+ */
union {
struct {
__be32 top_lse; /* top label stack entry */
@@ -89,6 +101,7 @@ struct sw_flow_key {
u8 frag; /* One of OVS_FRAG_TYPE_*. */
} ip;
};
+ u16 ct_zone; /* Conntrack zone. */
struct {
__be16 src; /* TCP/UDP/SCTP source port. */
__be16 dst; /* TCP/UDP/SCTP destination port. */
@@ -100,10 +113,16 @@ struct sw_flow_key {
__be32 src; /* IP source address. */
__be32 dst; /* IP destination address. */
} addr;
- struct {
- u8 sha[ETH_ALEN]; /* ARP source hardware address. */
- u8 tha[ETH_ALEN]; /* ARP target hardware address. */
- } arp;
+ union {
+ struct {
+ __be32 src;
+ __be32 dst;
+ } ct_orig; /* Conntrack original direction fields. */
+ struct {
+ u8 sha[ETH_ALEN]; /* ARP source hardware address. */
+ u8 tha[ETH_ALEN]; /* ARP target hardware address. */
+ } arp;
+ };
} ipv4;
struct {
struct {
@@ -111,23 +130,40 @@ struct sw_flow_key {
struct in6_addr dst; /* IPv6 destination address. */
} addr;
__be32 label; /* IPv6 flow label. */
- struct {
- struct in6_addr target; /* ND target address. */
- u8 sll[ETH_ALEN]; /* ND source link layer address. */
- u8 tll[ETH_ALEN]; /* ND target link layer address. */
- } nd;
+ union {
+ struct {
+ struct in6_addr src;
+ struct in6_addr dst;
+ } ct_orig; /* Conntrack original direction fields. */
+ struct {
+ struct in6_addr target; /* ND target address. */
+ u8 sll[ETH_ALEN]; /* ND source link layer address. */
+ u8 tll[ETH_ALEN]; /* ND target link layer address. */
+ } nd;
+ };
} ipv6;
};
struct {
- /* Connection tracking fields. */
- u16 zone;
+ /* Connection tracking fields not packed above. */
+ struct {
+ __be16 src; /* CT orig tuple tp src port. */
+ __be16 dst; /* CT orig tuple tp dst port. */
+ } orig_tp;
u32 mark;
- u8 state;
struct ovs_key_ct_labels labels;
} ct;
} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */
+static inline bool sw_flow_key_is_nd(const struct sw_flow_key *key)
+{
+ return key->eth.type == htons(ETH_P_IPV6) &&
+ key->ip.proto == NEXTHDR_ICMP &&
+ key->tp.dst == 0 &&
+ (key->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) ||
+ key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT));
+}
+
struct sw_flow_key_range {
unsigned short int start;
unsigned short int end;
@@ -206,6 +242,21 @@ struct arp_eth_header {
unsigned char ar_tip[4]; /* target IP address */
} __packed;
+static inline u8 ovs_key_mac_proto(const struct sw_flow_key *key)
+{
+ return key->mac_proto & ~SW_FLOW_KEY_INVALID;
+}
+
+static inline u16 __ovs_mac_header_len(u8 mac_proto)
+{
+ return mac_proto == MAC_PROTO_ETHERNET ? ETH_HLEN : 0;
+}
+
+static inline u16 ovs_mac_header_len(const struct sw_flow_key *key)
+{
+ return __ovs_mac_header_len(ovs_key_mac_proto(key));
+}
+
static inline bool ovs_identifier_is_ufid(const struct sw_flow_id *sfid)
{
return sfid->ufid_len;
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index ae25ded82b3b..1105a838bab8 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -123,13 +123,15 @@ static void update_range(struct sw_flow_match *match,
static bool match_validate(const struct sw_flow_match *match,
u64 key_attrs, u64 mask_attrs, bool log)
{
- u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET;
+ u64 key_expected = 0;
u64 mask_allowed = key_attrs; /* At most allow all key attributes */
/* The following mask attributes allowed only if they
* pass the validation tests. */
mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
+ | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)
| (1 << OVS_KEY_ATTR_IPV6)
+ | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)
| (1 << OVS_KEY_ATTR_TCP)
| (1 << OVS_KEY_ATTR_TCP_FLAGS)
| (1 << OVS_KEY_ATTR_UDP)
@@ -161,8 +163,10 @@ static bool match_validate(const struct sw_flow_match *match,
if (match->key->eth.type == htons(ETH_P_IP)) {
key_expected |= 1 << OVS_KEY_ATTR_IPV4;
- if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ if (match->mask && match->mask->key.eth.type == htons(0xffff)) {
mask_allowed |= 1 << OVS_KEY_ATTR_IPV4;
+ mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4;
+ }
if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
if (match->key->ip.proto == IPPROTO_UDP) {
@@ -196,8 +200,10 @@ static bool match_validate(const struct sw_flow_match *match,
if (match->key->eth.type == htons(ETH_P_IPV6)) {
key_expected |= 1 << OVS_KEY_ATTR_IPV6;
- if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ if (match->mask && match->mask->key.eth.type == htons(0xffff)) {
mask_allowed |= 1 << OVS_KEY_ATTR_IPV6;
+ mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6;
+ }
if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
if (match->key->ip.proto == IPPROTO_UDP) {
@@ -230,6 +236,12 @@ static bool match_validate(const struct sw_flow_match *match,
htons(NDISC_NEIGHBOUR_SOLICITATION) ||
match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
key_expected |= 1 << OVS_KEY_ATTR_ND;
+ /* Original direction conntrack tuple
+ * uses the same space as the ND fields
+ * in the key, so both are not allowed
+ * at the same time.
+ */
+ mask_allowed &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6);
if (match->mask && (match->mask->key.tp.src == htons(0xff)))
mask_allowed |= 1 << OVS_KEY_ATTR_ND;
}
@@ -282,7 +294,7 @@ size_t ovs_key_attr_size(void)
/* Whenever adding new OVS_KEY_ FIELDS, we should consider
* updating this function.
*/
- BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 26);
+ BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28);
return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */
+ nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */
@@ -295,6 +307,7 @@ size_t ovs_key_attr_size(void)
+ nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */
+ nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */
+ nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */
+ + nla_total_size(40) /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */
+ nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */
+ nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */
+ nla_total_size(4) /* OVS_KEY_ATTR_VLAN */
@@ -355,6 +368,10 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
[OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) },
[OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) },
[OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) },
+ [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4] = {
+ .len = sizeof(struct ovs_key_ct_tuple_ipv4) },
+ [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = {
+ .len = sizeof(struct ovs_key_ct_tuple_ipv6) },
};
static bool check_attr_len(unsigned int attr_len, unsigned int expected_len)
@@ -430,9 +447,8 @@ static int parse_flow_mask_nlattrs(const struct nlattr *attr,
return __parse_flow_nlattrs(attr, a, attrsp, log, true);
}
-static int parse_flow_nlattrs(const struct nlattr *attr,
- const struct nlattr *a[], u64 *attrsp,
- bool log)
+int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[],
+ u64 *attrsp, bool log)
{
return __parse_flow_nlattrs(attr, a, attrsp, log, false);
}
@@ -588,7 +604,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
ipv4 = true;
break;
case OVS_TUNNEL_KEY_ATTR_IPV6_SRC:
- SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst,
+ SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.src,
nla_get_in6_addr(a), is_mask);
ipv6 = true;
break;
@@ -649,6 +665,8 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
tun_flags |= TUNNEL_VXLAN_OPT;
opts_type = type;
break;
+ case OVS_TUNNEL_KEY_ATTR_PAD:
+ break;
default:
OVS_NLERR(log, "Unknown IP tunnel attribute %d",
type);
@@ -969,10 +987,33 @@ static int parse_vlan_from_nlattrs(struct sw_flow_match *match,
return 0;
}
+static int parse_eth_type_from_nlattrs(struct sw_flow_match *match,
+ u64 *attrs, const struct nlattr **a,
+ bool is_mask, bool log)
+{
+ __be16 eth_type;
+
+ eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+ if (is_mask) {
+ /* Always exact match EtherType. */
+ eth_type = htons(0xffff);
+ } else if (!eth_proto_is_802_3(eth_type)) {
+ OVS_NLERR(log, "EtherType %x is less than min %x",
+ ntohs(eth_type), ETH_P_802_3_MIN);
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
+ *attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+ return 0;
+}
+
static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
u64 *attrs, const struct nlattr **a,
bool is_mask, bool log)
{
+ u8 mac_proto = MAC_PROTO_ETHERNET;
+
if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) {
u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]);
@@ -1033,14 +1074,14 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
return -EINVAL;
}
- SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask);
+ SW_FLOW_KEY_PUT(match, ct_state, ct_state, is_mask);
*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE);
}
if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) &&
ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) {
u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]);
- SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask);
+ SW_FLOW_KEY_PUT(match, ct_zone, ct_zone, is_mask);
*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE);
}
if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) &&
@@ -1059,6 +1100,49 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
sizeof(*cl), is_mask);
*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS);
}
+ if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)) {
+ const struct ovs_key_ct_tuple_ipv4 *ct;
+
+ ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4]);
+
+ SW_FLOW_KEY_PUT(match, ipv4.ct_orig.src, ct->ipv4_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.ct_orig.dst, ct->ipv4_dst, is_mask);
+ SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
+ SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
+ SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv4_proto, is_mask);
+ *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4);
+ }
+ if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)) {
+ const struct ovs_key_ct_tuple_ipv6 *ct;
+
+ ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6]);
+
+ SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.src, &ct->ipv6_src,
+ sizeof(match->key->ipv6.ct_orig.src),
+ is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.dst, &ct->ipv6_dst,
+ sizeof(match->key->ipv6.ct_orig.dst),
+ is_mask);
+ SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
+ SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
+ SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv6_proto, is_mask);
+ *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6);
+ }
+
+ /* For layer 3 packets the Ethernet type is provided
+ * and treated as metadata but no MAC addresses are provided.
+ */
+ if (!(*attrs & (1ULL << OVS_KEY_ATTR_ETHERNET)) &&
+ (*attrs & (1ULL << OVS_KEY_ATTR_ETHERTYPE)))
+ mac_proto = MAC_PROTO_NONE;
+
+ /* Always exact match mac_proto */
+ SW_FLOW_KEY_PUT(match, mac_proto, is_mask ? 0xff : mac_proto, is_mask);
+
+ if (mac_proto == MAC_PROTO_NONE)
+ return parse_eth_type_from_nlattrs(match, attrs, a, is_mask,
+ log);
+
return 0;
}
@@ -1081,33 +1165,26 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
SW_FLOW_KEY_MEMCPY(match, eth.dst,
eth_key->eth_dst, ETH_ALEN, is_mask);
attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
- }
- if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
- /* VLAN attribute is always parsed before getting here since it
- * may occur multiple times.
- */
- OVS_NLERR(log, "VLAN attribute unexpected.");
- return -EINVAL;
- }
-
- if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
- __be16 eth_type;
-
- eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
- if (is_mask) {
- /* Always exact match EtherType. */
- eth_type = htons(0xffff);
- } else if (!eth_proto_is_802_3(eth_type)) {
- OVS_NLERR(log, "EtherType %x is less than min %x",
- ntohs(eth_type), ETH_P_802_3_MIN);
+ if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
+ /* VLAN attribute is always parsed before getting here since it
+ * may occur multiple times.
+ */
+ OVS_NLERR(log, "VLAN attribute unexpected.");
return -EINVAL;
}
- SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
- } else if (!is_mask) {
- SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
+ if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
+ err = parse_eth_type_from_nlattrs(match, &attrs, a, is_mask,
+ log);
+ if (err)
+ return err;
+ } else if (!is_mask) {
+ SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
+ }
+ } else if (!match->key->eth.type) {
+ OVS_NLERR(log, "Either Ethernet header or EtherType is required.");
+ return -EINVAL;
}
if (attrs & (1 << OVS_KEY_ATTR_IPV4)) {
@@ -1462,9 +1539,12 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr)
/**
* ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key.
- * @key: Receives extracted in_port, priority, tun_key and skb_mark.
- * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
- * sequence.
+ * @net: Network namespace.
+ * @key: Receives extracted in_port, priority, tun_key, skb_mark and conntrack
+ * metadata.
+ * @a: Array of netlink attributes holding parsed %OVS_KEY_ATTR_* Netlink
+ * attributes.
+ * @attrs: Bit mask for the netlink attributes included in @a.
* @log: Boolean to allow kernel error logging. Normally true, but when
* probing for feature compatibility this should be passed in as false to
* suppress unnecessary error logging.
@@ -1473,25 +1553,26 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr)
* take the same form accepted by flow_from_nlattrs(), but only enough of it to
* get the metadata, that is, the parts of the flow key that cannot be
* extracted from the packet itself.
+ *
+ * This must be called before the packet key fields are filled in 'key'.
*/
-int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr,
- struct sw_flow_key *key,
- bool log)
+int ovs_nla_get_flow_metadata(struct net *net,
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1],
+ u64 attrs, struct sw_flow_key *key, bool log)
{
- const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
struct sw_flow_match match;
- u64 attrs = 0;
- int err;
-
- err = parse_flow_nlattrs(attr, a, &attrs, log);
- if (err)
- return -EINVAL;
memset(&match, 0, sizeof(match));
match.key = key;
+ key->ct_state = 0;
+ key->ct_zone = 0;
+ key->ct_orig_proto = 0;
memset(&key->ct, 0, sizeof(key->ct));
+ memset(&key->ipv4.ct_orig, 0, sizeof(key->ipv4.ct_orig));
+ memset(&key->ipv6.ct_orig, 0, sizeof(key->ipv6.ct_orig));
+
key->phy.in_port = DP_MAX_PORTS;
return metadata_from_nlattrs(net, &match, &attrs, a, false, log);
@@ -1553,45 +1634,47 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
goto nla_put_failure;
- if (ovs_ct_put_key(output, skb))
+ if (ovs_ct_put_key(swkey, output, skb))
goto nla_put_failure;
- nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
- if (!nla)
- goto nla_put_failure;
-
- eth_key = nla_data(nla);
- ether_addr_copy(eth_key->eth_src, output->eth.src);
- ether_addr_copy(eth_key->eth_dst, output->eth.dst);
-
- if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) {
- if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask))
+ if (ovs_key_mac_proto(swkey) == MAC_PROTO_ETHERNET) {
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
+ if (!nla)
goto nla_put_failure;
- encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
- if (!swkey->eth.vlan.tci)
- goto unencap;
- if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) {
- if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask))
+ eth_key = nla_data(nla);
+ ether_addr_copy(eth_key->eth_src, output->eth.src);
+ ether_addr_copy(eth_key->eth_dst, output->eth.dst);
+
+ if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) {
+ if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask))
goto nla_put_failure;
- in_encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
- if (!swkey->eth.cvlan.tci)
+ encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
+ if (!swkey->eth.vlan.tci)
goto unencap;
+
+ if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) {
+ if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask))
+ goto nla_put_failure;
+ in_encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
+ if (!swkey->eth.cvlan.tci)
+ goto unencap;
+ }
}
- }
- if (swkey->eth.type == htons(ETH_P_802_2)) {
- /*
- * Ethertype 802.2 is represented in the netlink with omitted
- * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and
- * 0xffff in the mask attribute. Ethertype can also
- * be wildcarded.
- */
- if (is_mask && output->eth.type)
- if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
- output->eth.type))
- goto nla_put_failure;
- goto unencap;
+ if (swkey->eth.type == htons(ETH_P_802_2)) {
+ /*
+ * Ethertype 802.2 is represented in the netlink with omitted
+ * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and
+ * 0xffff in the mask attribute. Ethertype can also
+ * be wildcarded.
+ */
+ if (is_mask && output->eth.type)
+ if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
+ output->eth.type))
+ goto nla_put_failure;
+ goto unencap;
+ }
}
if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type))
@@ -2126,8 +2209,8 @@ static bool validate_masked(u8 *data, int len)
static int validate_set(const struct nlattr *a,
const struct sw_flow_key *flow_key,
- struct sw_flow_actions **sfa,
- bool *skip_copy, __be16 eth_type, bool masked, bool log)
+ struct sw_flow_actions **sfa, bool *skip_copy,
+ u8 mac_proto, __be16 eth_type, bool masked, bool log)
{
const struct nlattr *ovs_key = nla_data(a);
int key_type = nla_type(ovs_key);
@@ -2157,7 +2240,11 @@ static int validate_set(const struct nlattr *a,
case OVS_KEY_ATTR_SKB_MARK:
case OVS_KEY_ATTR_CT_MARK:
case OVS_KEY_ATTR_CT_LABELS:
+ break;
+
case OVS_KEY_ATTR_ETHERNET:
+ if (mac_proto != MAC_PROTO_ETHERNET)
+ return -EINVAL;
break;
case OVS_KEY_ATTR_TUNNEL:
@@ -2324,6 +2411,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
int depth, struct sw_flow_actions **sfa,
__be16 eth_type, __be16 vlan_tci, bool log)
{
+ u8 mac_proto = ovs_key_mac_proto(key);
const struct nlattr *a;
int rem, err;
@@ -2346,6 +2434,8 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
[OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash),
[OVS_ACTION_ATTR_CT] = (u32)-1,
[OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc),
+ [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth),
+ [OVS_ACTION_ATTR_POP_ETH] = 0,
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
@@ -2394,10 +2484,14 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
}
case OVS_ACTION_ATTR_POP_VLAN:
+ if (mac_proto != MAC_PROTO_ETHERNET)
+ return -EINVAL;
vlan_tci = htons(0);
break;
case OVS_ACTION_ATTR_PUSH_VLAN:
+ if (mac_proto != MAC_PROTO_ETHERNET)
+ return -EINVAL;
vlan = nla_data(a);
if (!eth_type_vlan(vlan->vlan_tpid))
return -EINVAL;
@@ -2447,14 +2541,16 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
case OVS_ACTION_ATTR_SET:
err = validate_set(a, key, sfa,
- &skip_copy, eth_type, false, log);
+ &skip_copy, mac_proto, eth_type,
+ false, log);
if (err)
return err;
break;
case OVS_ACTION_ATTR_SET_MASKED:
err = validate_set(a, key, sfa,
- &skip_copy, eth_type, true, log);
+ &skip_copy, mac_proto, eth_type,
+ true, log);
if (err)
return err;
break;
@@ -2474,6 +2570,22 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
skip_copy = true;
break;
+ case OVS_ACTION_ATTR_PUSH_ETH:
+ /* Disallow pushing an Ethernet header if one
+ * is already present */
+ if (mac_proto != MAC_PROTO_NONE)
+ return -EINVAL;
+ mac_proto = MAC_PROTO_NONE;
+ break;
+
+ case OVS_ACTION_ATTR_POP_ETH:
+ if (mac_proto != MAC_PROTO_ETHERNET)
+ return -EINVAL;
+ if (vlan_tci & htons(VLAN_TAG_PRESENT))
+ return -EINVAL;
+ mac_proto = MAC_PROTO_ETHERNET;
+ break;
+
default:
OVS_NLERR(log, "Unknown Action type %d", type);
return -EINVAL;
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 45f9769e5aac..929c665ac3aa 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -46,8 +46,11 @@ void ovs_match_init(struct sw_flow_match *match,
int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *,
int attr, bool is_mask, struct sk_buff *);
-int ovs_nla_get_flow_metadata(struct net *, const struct nlattr *,
- struct sw_flow_key *, bool log);
+int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[],
+ u64 *attrsp, bool log);
+int ovs_nla_get_flow_metadata(struct net *net,
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1],
+ u64 attrs, struct sw_flow_key *key, bool log);
int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb);
int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb);
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index e7da29021b38..89193a634da4 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -89,15 +89,6 @@ static const struct ethtool_ops internal_dev_ethtool_ops = {
.get_link = ethtool_op_get_link,
};
-static int internal_dev_change_mtu(struct net_device *netdev, int new_mtu)
-{
- if (new_mtu < 68)
- return -EINVAL;
-
- netdev->mtu = new_mtu;
- return 0;
-}
-
static void internal_dev_destructor(struct net_device *dev)
{
struct vport *vport = ovs_internal_dev_get_vport(dev);
@@ -106,7 +97,7 @@ static void internal_dev_destructor(struct net_device *dev)
free_netdev(dev);
}
-static struct rtnl_link_stats64 *
+static void
internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
int i;
@@ -134,8 +125,6 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
stats->tx_bytes += local_stats.tx_bytes;
stats->tx_packets += local_stats.tx_packets;
}
-
- return stats;
}
static void internal_set_rx_headroom(struct net_device *dev, int new_hr)
@@ -148,7 +137,6 @@ static const struct net_device_ops internal_dev_netdev_ops = {
.ndo_stop = internal_dev_stop,
.ndo_start_xmit = internal_dev_xmit,
.ndo_set_mac_address = eth_mac_addr,
- .ndo_change_mtu = internal_dev_change_mtu,
.ndo_get_stats64 = internal_get_stats,
.ndo_set_rx_headroom = internal_set_rx_headroom,
};
@@ -161,6 +149,8 @@ static void do_setup(struct net_device *netdev)
{
ether_setup(netdev);
+ netdev->max_mtu = ETH_MAX_MTU;
+
netdev->netdev_ops = &internal_dev_netdev_ops;
netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 4e3972344aa6..0389398fa4ab 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -57,8 +57,10 @@ static void netdev_port_receive(struct sk_buff *skb)
if (unlikely(!skb))
return;
- skb_push(skb, ETH_HLEN);
- skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
+ if (skb->dev->type == ARPHRD_ETHER) {
+ skb_push(skb, ETH_HLEN);
+ skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
+ }
ovs_vport_receive(vport, skb, skb_tunnel_info(skb));
return;
error:
@@ -97,7 +99,8 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name)
}
if (vport->dev->flags & IFF_LOOPBACK ||
- vport->dev->type != ARPHRD_ETHER ||
+ (vport->dev->type != ARPHRD_ETHER &&
+ vport->dev->type != ARPHRD_NONE) ||
ovs_is_internal_dev(vport->dev)) {
err = -EINVAL;
goto error_put;
@@ -162,7 +165,6 @@ void ovs_netdev_detach_dev(struct vport *vport)
netdev_master_upper_dev_get(vport->dev));
dev_set_promiscuity(vport->dev, -1);
}
-EXPORT_SYMBOL_GPL(ovs_netdev_detach_dev);
static void netdev_destroy(struct vport *vport)
{
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 7387418ac514..b6c8524032a0 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -463,27 +463,11 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
ovs_dp_process_packet(skb, &key);
return 0;
}
-EXPORT_SYMBOL_GPL(ovs_vport_receive);
-static void free_vport_rcu(struct rcu_head *rcu)
+static unsigned int packet_length(const struct sk_buff *skb,
+ struct net_device *dev)
{
- struct vport *vport = container_of(rcu, struct vport, rcu);
-
- ovs_vport_free(vport);
-}
-
-void ovs_vport_deferred_free(struct vport *vport)
-{
- if (!vport)
- return;
-
- call_rcu(&vport->rcu, free_vport_rcu);
-}
-EXPORT_SYMBOL_GPL(ovs_vport_deferred_free);
-
-static unsigned int packet_length(const struct sk_buff *skb)
-{
- unsigned int length = skb->len - ETH_HLEN;
+ unsigned int length = skb->len - dev->hard_header_len;
if (!skb_vlan_tag_present(skb) &&
eth_type_vlan(skb->protocol))
@@ -497,14 +481,34 @@ static unsigned int packet_length(const struct sk_buff *skb)
return length;
}
-void ovs_vport_send(struct vport *vport, struct sk_buff *skb)
+void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto)
{
int mtu = vport->dev->mtu;
- if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
+ switch (vport->dev->type) {
+ case ARPHRD_NONE:
+ if (mac_proto == MAC_PROTO_ETHERNET) {
+ skb_reset_network_header(skb);
+ skb_reset_mac_len(skb);
+ skb->protocol = htons(ETH_P_TEB);
+ } else if (mac_proto != MAC_PROTO_NONE) {
+ WARN_ON_ONCE(1);
+ goto drop;
+ }
+ break;
+ case ARPHRD_ETHER:
+ if (mac_proto != MAC_PROTO_ETHERNET)
+ goto drop;
+ break;
+ default:
+ goto drop;
+ }
+
+ if (unlikely(packet_length(skb, vport->dev) > mtu &&
+ !skb_is_gso(skb))) {
net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
vport->dev->name,
- packet_length(skb), mtu);
+ packet_length(skb, vport->dev), mtu);
vport->dev->stats.tx_errors++;
goto drop;
}
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index f01f28a567ad..cda66c26ad08 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -149,7 +149,6 @@ struct vport_ops {
struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *,
const struct vport_parms *);
void ovs_vport_free(struct vport *);
-void ovs_vport_deferred_free(struct vport *vport);
#define VPORT_ALIGN 8
@@ -198,6 +197,6 @@ int __ovs_vport_ops_register(struct vport_ops *ops);
})
void ovs_vport_ops_unregister(struct vport_ops *ops);
-void ovs_vport_send(struct vport *vport, struct sk_buff *skb);
+void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto);
#endif /* vport.h */
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index dd2332390c45..8489beff5c25 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -73,7 +73,7 @@
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
@@ -409,6 +409,9 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
flush_dcache_page(pgv_to_page(&h.h2->tp_status));
break;
case TPACKET_V3:
+ h.h3->tp_status = status;
+ flush_dcache_page(pgv_to_page(&h.h3->tp_status));
+ break;
default:
WARN(1, "TPACKET version not supported.\n");
BUG();
@@ -432,6 +435,8 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
flush_dcache_page(pgv_to_page(&h.h2->tp_status));
return h.h2->tp_status;
case TPACKET_V3:
+ flush_dcache_page(pgv_to_page(&h.h3->tp_status));
+ return h.h3->tp_status;
default:
WARN(1, "TPACKET version not supported.\n");
BUG();
@@ -476,6 +481,9 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
h.h2->tp_nsec = ts.tv_nsec;
break;
case TPACKET_V3:
+ h.h3->tp_sec = ts.tv_sec;
+ h.h3->tp_nsec = ts.tv_nsec;
+ break;
default:
WARN(1, "TPACKET version not supported.\n");
BUG();
@@ -1497,6 +1505,8 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po)
f->arr[f->num_members] = sk;
smp_wmb();
f->num_members++;
+ if (f->num_members == 1)
+ dev_add_pack(&f->prot_hook);
spin_unlock(&f->lock);
}
@@ -1513,6 +1523,8 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
BUG_ON(i >= f->num_members);
f->arr[i] = f->arr[f->num_members - 1];
f->num_members--;
+ if (f->num_members == 0)
+ __dev_remove_pack(&f->prot_hook);
spin_unlock(&f->lock);
}
@@ -1619,6 +1631,7 @@ static void fanout_release_data(struct packet_fanout *f)
static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
+ struct packet_rollover *rollover = NULL;
struct packet_sock *po = pkt_sk(sk);
struct packet_fanout *f, *match;
u8 type = type_flags & 0xff;
@@ -1641,23 +1654,28 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
return -EINVAL;
}
+ mutex_lock(&fanout_mutex);
+
+ err = -EINVAL;
if (!po->running)
- return -EINVAL;
+ goto out;
+ err = -EALREADY;
if (po->fanout)
- return -EALREADY;
+ goto out;
if (type == PACKET_FANOUT_ROLLOVER ||
(type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
- po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL);
- if (!po->rollover)
- return -ENOMEM;
- atomic_long_set(&po->rollover->num, 0);
- atomic_long_set(&po->rollover->num_huge, 0);
- atomic_long_set(&po->rollover->num_failed, 0);
+ err = -ENOMEM;
+ rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
+ if (!rollover)
+ goto out;
+ atomic_long_set(&rollover->num, 0);
+ atomic_long_set(&rollover->num_huge, 0);
+ atomic_long_set(&rollover->num_failed, 0);
+ po->rollover = rollover;
}
- mutex_lock(&fanout_mutex);
match = NULL;
list_for_each_entry(f, &fanout_list, list) {
if (f->id == id &&
@@ -1687,7 +1705,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
match->prot_hook.func = packet_rcv_fanout;
match->prot_hook.af_packet_priv = match;
match->prot_hook.id_match = match_fanout_group;
- dev_add_pack(&match->prot_hook);
list_add(&match->list, &fanout_list);
}
err = -EINVAL;
@@ -1704,36 +1721,40 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
}
}
out:
- mutex_unlock(&fanout_mutex);
- if (err) {
- kfree(po->rollover);
+ if (err && rollover) {
+ kfree(rollover);
po->rollover = NULL;
}
+ mutex_unlock(&fanout_mutex);
return err;
}
-static void fanout_release(struct sock *sk)
+/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
+ * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
+ * It is the responsibility of the caller to call fanout_release_data() and
+ * free the returned packet_fanout (after synchronize_net())
+ */
+static struct packet_fanout *fanout_release(struct sock *sk)
{
struct packet_sock *po = pkt_sk(sk);
struct packet_fanout *f;
+ mutex_lock(&fanout_mutex);
f = po->fanout;
- if (!f)
- return;
+ if (f) {
+ po->fanout = NULL;
- mutex_lock(&fanout_mutex);
- po->fanout = NULL;
+ if (atomic_dec_and_test(&f->sk_ref))
+ list_del(&f->list);
+ else
+ f = NULL;
- if (atomic_dec_and_test(&f->sk_ref)) {
- list_del(&f->list);
- dev_remove_pack(&f->prot_hook);
- fanout_release_data(f);
- kfree(f);
+ if (po->rollover)
+ kfree_rcu(po->rollover, rcu);
}
mutex_unlock(&fanout_mutex);
- if (po->rollover)
- kfree_rcu(po->rollover, rcu);
+ return f;
}
static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
@@ -1967,17 +1988,6 @@ static unsigned int run_filter(struct sk_buff *skb,
return res;
}
-static int __packet_rcv_vnet(const struct sk_buff *skb,
- struct virtio_net_hdr *vnet_hdr)
-{
- *vnet_hdr = (const struct virtio_net_hdr) { 0 };
-
- if (virtio_net_hdr_from_skb(skb, vnet_hdr, vio_le()))
- BUG();
-
- return 0;
-}
-
static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
size_t *len)
{
@@ -1987,7 +1997,7 @@ static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
return -EINVAL;
*len -= sizeof(vnet_hdr);
- if (__packet_rcv_vnet(skb, &vnet_hdr))
+ if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true))
return -EINVAL;
return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
@@ -2246,8 +2256,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
spin_unlock(&sk->sk_receive_queue.lock);
if (po->has_vnet_hdr) {
- if (__packet_rcv_vnet(skb, h.raw + macoff -
- sizeof(struct virtio_net_hdr))) {
+ if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
+ sizeof(struct virtio_net_hdr),
+ vio_le(), true)) {
spin_lock(&sk->sk_receive_queue.lock);
goto drop_n_account;
}
@@ -2390,8 +2401,6 @@ static void tpacket_set_protocol(const struct net_device *dev,
static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
{
- unsigned short gso_type = 0;
-
if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
(__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
__virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
@@ -2403,69 +2412,22 @@ static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
return -EINVAL;
- if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
- switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
- case VIRTIO_NET_HDR_GSO_TCPV4:
- gso_type = SKB_GSO_TCPV4;
- break;
- case VIRTIO_NET_HDR_GSO_TCPV6:
- gso_type = SKB_GSO_TCPV6;
- break;
- case VIRTIO_NET_HDR_GSO_UDP:
- gso_type = SKB_GSO_UDP;
- break;
- default:
- return -EINVAL;
- }
-
- if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
- gso_type |= SKB_GSO_TCP_ECN;
-
- if (vnet_hdr->gso_size == 0)
- return -EINVAL;
- }
-
- vnet_hdr->gso_type = gso_type; /* changes type, temporary storage */
return 0;
}
static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
struct virtio_net_hdr *vnet_hdr)
{
- int n;
-
if (*len < sizeof(*vnet_hdr))
return -EINVAL;
*len -= sizeof(*vnet_hdr);
- n = copy_from_iter(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter);
- if (n != sizeof(*vnet_hdr))
+ if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
return -EFAULT;
return __packet_snd_vnet_parse(vnet_hdr, *len);
}
-static int packet_snd_vnet_gso(struct sk_buff *skb,
- struct virtio_net_hdr *vnet_hdr)
-{
- if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
- u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start);
- u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset);
-
- if (!skb_partial_csum_set(skb, s, o))
- return -EINVAL;
- }
-
- skb_shinfo(skb)->gso_size =
- __virtio16_to_cpu(vio_le(), vnet_hdr->gso_size);
- skb_shinfo(skb)->gso_type = vnet_hdr->gso_type;
-
- /* Header must be checked, and gso_segs computed. */
- skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
- skb_shinfo(skb)->gso_segs = 0;
- return 0;
-}
-
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
void *frame, struct net_device *dev, void *data, int tp_len,
__be16 proto, unsigned char *addr, int hlen, int copylen,
@@ -2556,6 +2518,13 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
ph.raw = frame;
switch (po->tp_version) {
+ case TPACKET_V3:
+ if (ph.h3->tp_next_offset != 0) {
+ pr_warn_once("variable sized slot not supported");
+ return -EINVAL;
+ }
+ tp_len = ph.h3->tp_len;
+ break;
case TPACKET_V2:
tp_len = ph.h2->tp_len;
break;
@@ -2575,6 +2544,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
off_max = po->tx_ring.frame_size - tp_len;
if (po->sk.sk_type == SOCK_DGRAM) {
switch (po->tp_version) {
+ case TPACKET_V3:
+ off = ph.h3->tp_net;
+ break;
case TPACKET_V2:
off = ph.h2->tp_net;
break;
@@ -2584,6 +2556,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
}
} else {
switch (po->tp_version) {
+ case TPACKET_V3:
+ off = ph.h3->tp_mac;
+ break;
case TPACKET_V2:
off = ph.h2->tp_mac;
break;
@@ -2725,7 +2700,8 @@ tpacket_error:
}
}
- if (po->has_vnet_hdr && packet_snd_vnet_gso(skb, vnet_hdr)) {
+ if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr,
+ vio_le())) {
tp_len = -EINVAL;
goto tpacket_error;
}
@@ -2813,7 +2789,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
struct virtio_net_hdr vnet_hdr = { 0 };
int offset = 0;
struct packet_sock *po = pkt_sk(sk);
- int hlen, tlen;
+ int hlen, tlen, linear;
int extra_len = 0;
/*
@@ -2874,8 +2850,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
err = -ENOBUFS;
hlen = LL_RESERVED_SPACE(dev);
tlen = dev->needed_tailroom;
- skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
- __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len),
+ linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
+ linear = max(linear, min_t(int, len, dev->hard_header_len));
+ skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
msg->msg_flags & MSG_DONTWAIT, &err);
if (skb == NULL)
goto out_unlock;
@@ -2916,7 +2893,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
packet_pick_tx_queue(dev, skb);
if (po->has_vnet_hdr) {
- err = packet_snd_vnet_gso(skb, &vnet_hdr);
+ err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
if (err)
goto out_free;
len += sizeof(vnet_hdr);
@@ -2964,6 +2941,7 @@ static int packet_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct packet_sock *po;
+ struct packet_fanout *f;
struct net *net;
union tpacket_req_u req_u;
@@ -3003,9 +2981,14 @@ static int packet_release(struct socket *sock)
packet_set_ring(sk, &req_u, 1, 1);
}
- fanout_release(sk);
+ f = fanout_release(sk);
synchronize_net();
+
+ if (f) {
+ fanout_release_data(f);
+ kfree(f);
+ }
/*
* Now the socket is dead. No more input will appear.
*/
@@ -3120,7 +3103,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
int addr_len)
{
struct sock *sk = sock->sk;
- char name[15];
+ char name[sizeof(uaddr->sa_data) + 1];
/*
* Check legality
@@ -3128,7 +3111,11 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
if (addr_len != sizeof(struct sockaddr))
return -EINVAL;
- strlcpy(name, uaddr->sa_data, sizeof(name));
+ /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
+ * zero-terminated.
+ */
+ memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
+ name[sizeof(uaddr->sa_data)] = 0;
return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
}
@@ -3678,6 +3665,8 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
return -EBUSY;
if (copy_from_user(&val, optval, sizeof(val)))
return -EFAULT;
+ if (val > INT_MAX)
+ return -EINVAL;
po->tp_reserve = val;
return 0;
}
@@ -3957,7 +3946,6 @@ static int packet_notifier(struct notifier_block *this,
}
if (msg == NETDEV_UNREGISTER) {
packet_cached_dev_reset(po);
- fanout_release(sk);
po->ifindex = -1;
if (po->prot_hook.dev)
dev_put(po->prot_hook.dev);
@@ -4171,11 +4159,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
struct tpacket_req *req = &req_u->req;
lock_sock(sk);
- /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
- if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
- net_warn_ratelimited("Tx-ring is not supported.\n");
- goto out;
- }
rb = tx_ring ? &po->tx_ring : &po->rx_ring;
rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
@@ -4212,8 +4195,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
goto out;
if (po->tp_version >= TPACKET_V3 &&
- (int)(req->tp_block_size -
- BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
+ req->tp_block_size <=
+ BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv))
goto out;
if (unlikely(req->tp_frame_size < po->tp_hdrlen +
po->tp_reserve))
@@ -4224,6 +4207,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
if (unlikely(rb->frames_per_block == 0))
goto out;
+ if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
+ goto out;
if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
req->tp_frame_nr))
goto out;
@@ -4235,11 +4220,19 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
goto out;
switch (po->tp_version) {
case TPACKET_V3:
- /* Transmit path is not supported. We checked
- * it above but just being paranoid
- */
- if (!tx_ring)
+ /* Block transmit is not supported yet */
+ if (!tx_ring) {
init_prb_bdqc(po, rb, pg_vec, req_u);
+ } else {
+ struct tpacket_req3 *req3 = &req_u->req3;
+
+ if (req3->tp_retire_blk_tov ||
+ req3->tp_sizeof_priv ||
+ req3->tp_feature_req_word) {
+ err = -EINVAL;
+ goto out;
+ }
+ }
break;
default:
break;
diff --git a/net/packet/diag.c b/net/packet/diag.c
index 0ed68f0238bf..7ef1c881ae74 100644
--- a/net/packet/diag.c
+++ b/net/packet/diag.c
@@ -73,8 +73,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
{
struct packet_diag_ring pdr;
- if (!ring->pg_vec || ((ver > TPACKET_V2) &&
- (nl_type == PACKET_DIAG_TX_RING)))
+ if (!ring->pg_vec)
return 0;
pdr.pdr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c
index fa8237fdc57b..21c28b51be94 100644
--- a/net/phonet/pep-gprs.c
+++ b/net/phonet/pep-gprs.c
@@ -217,20 +217,10 @@ static netdev_tx_t gprs_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
}
-static int gprs_set_mtu(struct net_device *dev, int new_mtu)
-{
- if ((new_mtu < 576) || (new_mtu > (PHONET_MAX_MTU - 11)))
- return -EINVAL;
-
- dev->mtu = new_mtu;
- return 0;
-}
-
static const struct net_device_ops gprs_netdev_ops = {
.ndo_open = gprs_open,
.ndo_stop = gprs_close,
.ndo_start_xmit = gprs_xmit,
- .ndo_change_mtu = gprs_set_mtu,
};
static void gprs_setup(struct net_device *dev)
@@ -239,6 +229,8 @@ static void gprs_setup(struct net_device *dev)
dev->type = ARPHRD_PHONET_PIPE;
dev->flags = IFF_POINTOPOINT | IFF_NOARP;
dev->mtu = GPRS_DEFAULT_MTU;
+ dev->min_mtu = 576;
+ dev->max_mtu = (PHONET_MAX_MTU - 11);
dev->hard_header_len = 0;
dev->addr_len = 0;
dev->tx_queue_len = 10;
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 850a86cde0b3..e81537991ddf 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -23,6 +23,7 @@
*/
#include <linux/kernel.h>
+#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <net/sock.h>
@@ -771,7 +772,8 @@ static void pep_sock_close(struct sock *sk, long timeout)
sock_put(sk);
}
-static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp)
+static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp,
+ bool kern)
{
struct pep_sock *pn = pep_sk(sk), *newpn;
struct sock *newsk = NULL;
@@ -845,7 +847,8 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp)
}
/* Create a new to-be-accepted sock */
- newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, 0);
+ newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot,
+ kern);
if (!newsk) {
pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL);
err = -ENOBUFS;
@@ -1167,7 +1170,7 @@ disabled:
/* Wait until flow control allows TX */
done = atomic_read(&pn->tx_credits);
while (!done) {
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
if (!timeo) {
err = -EAGAIN;
@@ -1178,10 +1181,9 @@ disabled:
goto out;
}
- prepare_to_wait(sk_sleep(sk), &wait,
- TASK_INTERRUPTIBLE);
- done = sk_wait_event(sk, &timeo, atomic_read(&pn->tx_credits));
- finish_wait(sk_sleep(sk), &wait);
+ add_wait_queue(sk_sleep(sk), &wait);
+ done = sk_wait_event(sk, &timeo, atomic_read(&pn->tx_credits), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
if (sk->sk_state != TCP_ESTABLISHED)
goto disabled;
diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c
index a58680016472..2cb4c5dfad6f 100644
--- a/net/phonet/pn_dev.c
+++ b/net/phonet/pn_dev.c
@@ -44,7 +44,7 @@ struct phonet_net {
struct phonet_routes routes;
};
-static int phonet_net_id __read_mostly;
+static unsigned int phonet_net_id __read_mostly;
static struct phonet_net *phonet_pernet(struct net *net)
{
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index ffd5f2297584..64634e3ec2fc 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -27,6 +27,8 @@
#include <linux/kernel.h>
#include <linux/net.h>
#include <linux/poll.h>
+#include <linux/sched/signal.h>
+
#include <net/sock.h>
#include <net/tcp_states.h>
@@ -303,7 +305,7 @@ out:
}
static int pn_socket_accept(struct socket *sock, struct socket *newsock,
- int flags)
+ int flags, bool kern)
{
struct sock *sk = sock->sk;
struct sock *newsk;
@@ -312,7 +314,7 @@ static int pn_socket_accept(struct socket *sock, struct socket *newsock,
if (unlikely(sk->sk_state != TCP_LISTEN))
return -EINVAL;
- newsk = sk->sk_prot->accept(sk, flags, &err);
+ newsk = sk->sk_prot->accept(sk, flags, &err, kern);
if (!newsk)
return err;
diff --git a/net/psample/Kconfig b/net/psample/Kconfig
new file mode 100644
index 000000000000..d850246a6059
--- /dev/null
+++ b/net/psample/Kconfig
@@ -0,0 +1,15 @@
+#
+# psample packet sampling configuration
+#
+
+menuconfig PSAMPLE
+ depends on NET
+ tristate "Packet-sampling netlink channel"
+ default n
+ help
+ Say Y here to add support for packet-sampling netlink channel
+ This netlink channel allows transferring packets alongside some
+ metadata to userspace.
+
+ To compile this support as a module, choose M here: the module will
+ be called psample.
diff --git a/net/psample/Makefile b/net/psample/Makefile
new file mode 100644
index 000000000000..609b0a79c9f3
--- /dev/null
+++ b/net/psample/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the psample netlink channel
+#
+
+obj-$(CONFIG_PSAMPLE) += psample.o
diff --git a/net/psample/psample.c b/net/psample/psample.c
new file mode 100644
index 000000000000..8aa58a918783
--- /dev/null
+++ b/net/psample/psample.c
@@ -0,0 +1,301 @@
+/*
+ * net/psample/psample.c - Netlink channel for packet sampling
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/psample.h>
+#include <linux/spinlock.h>
+
+#define PSAMPLE_MAX_PACKET_SIZE 0xffff
+
+static LIST_HEAD(psample_groups_list);
+static DEFINE_SPINLOCK(psample_groups_lock);
+
+/* multicast groups */
+enum psample_nl_multicast_groups {
+ PSAMPLE_NL_MCGRP_CONFIG,
+ PSAMPLE_NL_MCGRP_SAMPLE,
+};
+
+static const struct genl_multicast_group psample_nl_mcgrps[] = {
+ [PSAMPLE_NL_MCGRP_CONFIG] = { .name = PSAMPLE_NL_MCGRP_CONFIG_NAME },
+ [PSAMPLE_NL_MCGRP_SAMPLE] = { .name = PSAMPLE_NL_MCGRP_SAMPLE_NAME },
+};
+
+static struct genl_family psample_nl_family __ro_after_init;
+
+static int psample_group_nl_fill(struct sk_buff *msg,
+ struct psample_group *group,
+ enum psample_command cmd, u32 portid, u32 seq,
+ int flags)
+{
+ void *hdr;
+ int ret;
+
+ hdr = genlmsg_put(msg, portid, seq, &psample_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ ret = nla_put_u32(msg, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num);
+ if (ret < 0)
+ goto error;
+
+ ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_REFCOUNT, group->refcount);
+ if (ret < 0)
+ goto error;
+
+ ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_SEQ, group->seq);
+ if (ret < 0)
+ goto error;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+error:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct psample_group *group;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ spin_lock(&psample_groups_lock);
+ list_for_each_entry(group, &psample_groups_list, list) {
+ if (!net_eq(group->net, sock_net(msg->sk)))
+ continue;
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = psample_group_nl_fill(msg, group, PSAMPLE_CMD_NEW_GROUP,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI);
+ if (err)
+ break;
+ idx++;
+ }
+
+ spin_unlock(&psample_groups_lock);
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static const struct genl_ops psample_nl_ops[] = {
+ {
+ .cmd = PSAMPLE_CMD_GET_GROUP,
+ .dumpit = psample_nl_cmd_get_group_dumpit,
+ /* can be retrieved by unprivileged users */
+ }
+};
+
+static struct genl_family psample_nl_family __ro_after_init = {
+ .name = PSAMPLE_GENL_NAME,
+ .version = PSAMPLE_GENL_VERSION,
+ .maxattr = PSAMPLE_ATTR_MAX,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .mcgrps = psample_nl_mcgrps,
+ .ops = psample_nl_ops,
+ .n_ops = ARRAY_SIZE(psample_nl_ops),
+ .n_mcgrps = ARRAY_SIZE(psample_nl_mcgrps),
+};
+
+static void psample_group_notify(struct psample_group *group,
+ enum psample_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!msg)
+ return;
+
+ err = psample_group_nl_fill(msg, group, cmd, 0, 0, NLM_F_MULTI);
+ if (!err)
+ genlmsg_multicast_netns(&psample_nl_family, group->net, msg, 0,
+ PSAMPLE_NL_MCGRP_CONFIG, GFP_ATOMIC);
+ else
+ nlmsg_free(msg);
+}
+
+static struct psample_group *psample_group_create(struct net *net,
+ u32 group_num)
+{
+ struct psample_group *group;
+
+ group = kzalloc(sizeof(*group), GFP_ATOMIC);
+ if (!group)
+ return NULL;
+
+ group->net = net;
+ group->group_num = group_num;
+ list_add_tail(&group->list, &psample_groups_list);
+
+ psample_group_notify(group, PSAMPLE_CMD_NEW_GROUP);
+ return group;
+}
+
+static void psample_group_destroy(struct psample_group *group)
+{
+ psample_group_notify(group, PSAMPLE_CMD_DEL_GROUP);
+ list_del(&group->list);
+ kfree(group);
+}
+
+static struct psample_group *
+psample_group_lookup(struct net *net, u32 group_num)
+{
+ struct psample_group *group;
+
+ list_for_each_entry(group, &psample_groups_list, list)
+ if ((group->group_num == group_num) && (group->net == net))
+ return group;
+ return NULL;
+}
+
+struct psample_group *psample_group_get(struct net *net, u32 group_num)
+{
+ struct psample_group *group;
+
+ spin_lock(&psample_groups_lock);
+
+ group = psample_group_lookup(net, group_num);
+ if (!group) {
+ group = psample_group_create(net, group_num);
+ if (!group)
+ goto out;
+ }
+ group->refcount++;
+
+out:
+ spin_unlock(&psample_groups_lock);
+ return group;
+}
+EXPORT_SYMBOL_GPL(psample_group_get);
+
+void psample_group_put(struct psample_group *group)
+{
+ spin_lock(&psample_groups_lock);
+
+ if (--group->refcount == 0)
+ psample_group_destroy(group);
+
+ spin_unlock(&psample_groups_lock);
+}
+EXPORT_SYMBOL_GPL(psample_group_put);
+
+void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
+ u32 trunc_size, int in_ifindex, int out_ifindex,
+ u32 sample_rate)
+{
+ struct sk_buff *nl_skb;
+ int data_len;
+ int meta_len;
+ void *data;
+ int ret;
+
+ meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) +
+ (out_ifindex ? nla_total_size(sizeof(u16)) : 0) +
+ nla_total_size(sizeof(u32)) + /* sample_rate */
+ nla_total_size(sizeof(u32)) + /* orig_size */
+ nla_total_size(sizeof(u32)) + /* group_num */
+ nla_total_size(sizeof(u32)); /* seq */
+
+ data_len = min(skb->len, trunc_size);
+ if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE)
+ data_len = PSAMPLE_MAX_PACKET_SIZE - meta_len - NLA_HDRLEN
+ - NLA_ALIGNTO;
+
+ nl_skb = genlmsg_new(meta_len + data_len, GFP_ATOMIC);
+ if (unlikely(!nl_skb))
+ return;
+
+ data = genlmsg_put(nl_skb, 0, 0, &psample_nl_family, 0,
+ PSAMPLE_CMD_SAMPLE);
+ if (unlikely(!data))
+ goto error;
+
+ if (in_ifindex) {
+ ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_IIFINDEX, in_ifindex);
+ if (unlikely(ret < 0))
+ goto error;
+ }
+
+ if (out_ifindex) {
+ ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OIFINDEX, out_ifindex);
+ if (unlikely(ret < 0))
+ goto error;
+ }
+
+ ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_RATE, sample_rate);
+ if (unlikely(ret < 0))
+ goto error;
+
+ ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_ORIGSIZE, skb->len);
+ if (unlikely(ret < 0))
+ goto error;
+
+ ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num);
+ if (unlikely(ret < 0))
+ goto error;
+
+ ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_GROUP_SEQ, group->seq++);
+ if (unlikely(ret < 0))
+ goto error;
+
+ if (data_len) {
+ int nla_len = nla_total_size(data_len);
+ struct nlattr *nla;
+
+ nla = (struct nlattr *)skb_put(nl_skb, nla_len);
+ nla->nla_type = PSAMPLE_ATTR_DATA;
+ nla->nla_len = nla_attr_size(data_len);
+
+ if (skb_copy_bits(skb, 0, nla_data(nla), data_len))
+ goto error;
+ }
+
+ genlmsg_end(nl_skb, data);
+ genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0,
+ PSAMPLE_NL_MCGRP_SAMPLE, GFP_ATOMIC);
+
+ return;
+error:
+ pr_err_ratelimited("Could not create psample log message\n");
+ nlmsg_free(nl_skb);
+}
+EXPORT_SYMBOL_GPL(psample_sample_packet);
+
+static int __init psample_module_init(void)
+{
+ return genl_register_family(&psample_nl_family);
+}
+
+static void __exit psample_module_exit(void)
+{
+ genl_unregister_family(&psample_nl_family);
+}
+
+module_init(psample_module_init);
+module_exit(psample_module_exit);
+
+MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
+MODULE_DESCRIPTION("netlink channel for packet sampling");
+MODULE_LICENSE("GPL v2");
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index c985ecbe9bd6..9da7368b0140 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -252,7 +252,7 @@ static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node,
const int pkt_len = 20;
struct qrtr_hdr *hdr;
struct sk_buff *skb;
- u32 *buf;
+ __le32 *buf;
skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL);
if (!skb)
@@ -269,7 +269,7 @@ static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node,
hdr->dst_node_id = cpu_to_le32(dst_node);
hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL);
- buf = (u32 *)skb_put(skb, pkt_len);
+ buf = (__le32 *)skb_put(skb, pkt_len);
memset(buf, 0, pkt_len);
buf[0] = cpu_to_le32(QRTR_TYPE_RESUME_TX);
buf[1] = cpu_to_le32(src_node);
@@ -658,7 +658,9 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
}
if (plen != len) {
- skb_pad(skb, plen - len);
+ rc = skb_pad(skb, plen - len);
+ if (rc)
+ goto out_node;
skb_put(skb, plen - len);
}
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 6beaeb1138f3..b405f77d664c 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -298,6 +298,33 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
return 0;
}
+static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval,
+ int optlen)
+{
+ struct rds_rx_trace_so trace;
+ int i;
+
+ if (optlen != sizeof(struct rds_rx_trace_so))
+ return -EFAULT;
+
+ if (copy_from_user(&trace, optval, sizeof(trace)))
+ return -EFAULT;
+
+ if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX)
+ return -EFAULT;
+
+ rs->rs_rx_traces = trace.rx_traces;
+ for (i = 0; i < rs->rs_rx_traces; i++) {
+ if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) {
+ rs->rs_rx_traces = 0;
+ return -EFAULT;
+ }
+ rs->rs_rx_trace[i] = trace.rx_trace_pos[i];
+ }
+
+ return 0;
+}
+
static int rds_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
@@ -338,6 +365,9 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
ret = rds_enable_recvtstamp(sock->sk, optval, optlen);
release_sock(sock->sk);
break;
+ case SO_RDS_MSG_RXPATH_LATENCY:
+ ret = rds_recv_track_latency(rs, optval, optlen);
+ break;
default:
ret = -ENOPROTOOPT;
}
@@ -484,6 +514,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
INIT_LIST_HEAD(&rs->rs_cong_list);
spin_lock_init(&rs->rs_rdma_lock);
rs->rs_rdma_keys = RB_ROOT;
+ rs->rs_rx_traces = 0;
spin_lock_bh(&rds_sock_lock);
list_add_tail(&rs->rs_item, &rds_sock_list);
@@ -605,10 +636,14 @@ static void rds_exit(void)
}
module_exit(rds_exit);
+u32 rds_gen_num;
+
static int rds_init(void)
{
int ret;
+ net_get_random_once(&rds_gen_num, sizeof(rds_gen_num));
+
ret = rds_bind_lock_init();
if (ret)
goto out;
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 095f6ce583fe..3a915bedb76c 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -176,8 +176,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (!trans) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
- printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, "
- "load rds_tcp or rds_rdma?\n");
+ pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n",
+ __func__, &sin->sin_addr.s_addr);
goto out;
}
diff --git a/net/rds/connection.c b/net/rds/connection.c
index f5058559bb08..1fa75ab7b733 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -269,6 +269,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
kmem_cache_free(rds_conn_slab, conn);
conn = found;
} else {
+ conn->c_my_gen_num = rds_gen_num;
+ conn->c_peer_gen_num = 0;
hlist_add_head_rcu(&conn->c_hash_node, head);
rds_cong_add_conn(conn);
rds_conn_count++;
@@ -427,6 +429,7 @@ void rds_conn_destroy(struct rds_connection *conn)
*/
rds_cong_remove_conn(conn);
+ put_net(conn->c_net);
kmem_cache_free(rds_conn_slab, conn);
spin_lock_irqsave(&rds_conn_lock, flags);
@@ -543,11 +546,11 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
}
EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
-void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
- struct rds_info_iterator *iter,
- struct rds_info_lengths *lens,
- int (*visitor)(struct rds_conn_path *, void *),
- size_t item_len)
+static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int (*visitor)(struct rds_conn_path *, void *),
+ size_t item_len)
{
u64 buffer[(item_len + 7) / 8];
struct hlist_head *head;
@@ -681,6 +684,7 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
!test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
}
+EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);
void rds_conn_connect_if_down(struct rds_connection *conn)
{
@@ -689,21 +693,6 @@ void rds_conn_connect_if_down(struct rds_connection *conn)
}
EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
-/*
- * An error occurred on the connection
- */
-void
-__rds_conn_error(struct rds_connection *conn, const char *fmt, ...)
-{
- va_list ap;
-
- va_start(ap, fmt);
- vprintk(fmt, ap);
- va_end(ap);
-
- rds_conn_drop(conn);
-}
-
void
__rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...)
{
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 5680d90b0b77..7a64c8db81ab 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -45,8 +45,8 @@
#include "ib.h"
#include "ib_mr.h"
-unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE;
-unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE;
+static unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE;
+static unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE;
unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
module_param(rds_ib_mr_1m_pool_size, int, 0444);
@@ -111,6 +111,8 @@ static void rds_ib_dev_free(struct work_struct *work)
kfree(i_ipaddr);
}
+ kfree(rds_ibdev->vector_load);
+
kfree(rds_ibdev);
}
@@ -159,6 +161,14 @@ static void rds_ib_add_one(struct ib_device *device)
rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
+ rds_ibdev->vector_load = kzalloc(sizeof(int) * device->num_comp_vectors,
+ GFP_KERNEL);
+ if (!rds_ibdev->vector_load) {
+ pr_err("RDS/IB: %s failed to allocate vector memory\n",
+ __func__);
+ goto put_dev;
+ }
+
rds_ibdev->dev = device;
rds_ibdev->pd = ib_alloc_pd(device, 0);
if (IS_ERR(rds_ibdev->pd)) {
@@ -428,16 +438,12 @@ int rds_ib_init(void)
if (ret)
goto out_sysctl;
- ret = rds_trans_register(&rds_ib_transport);
- if (ret)
- goto out_recv;
+ rds_trans_register(&rds_ib_transport);
rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
goto out;
-out_recv:
- rds_ib_recv_exit();
out_sysctl:
rds_ib_sysctl_exit();
out_ibreg:
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 45ac8e8e58f4..ec550626e221 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -14,9 +14,10 @@
#define RDS_IB_DEFAULT_RECV_WR 1024
#define RDS_IB_DEFAULT_SEND_WR 256
-#define RDS_IB_DEFAULT_FR_WR 512
+#define RDS_IB_DEFAULT_FR_WR 256
+#define RDS_IB_DEFAULT_FR_INV_WR 256
-#define RDS_IB_DEFAULT_RETRY_COUNT 2
+#define RDS_IB_DEFAULT_RETRY_COUNT 1
#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
@@ -125,6 +126,7 @@ struct rds_ib_connection {
/* To control the number of wrs from fastreg */
atomic_t i_fastreg_wrs;
+ atomic_t i_fastunreg_wrs;
/* interrupt handling */
struct tasklet_struct i_send_tasklet;
@@ -134,7 +136,7 @@ struct rds_ib_connection {
struct rds_ib_work_ring i_send_ring;
struct rm_data_op *i_data_op;
struct rds_header *i_send_hdrs;
- u64 i_send_hdrs_dma;
+ dma_addr_t i_send_hdrs_dma;
struct rds_ib_send_work *i_sends;
atomic_t i_signaled_sends;
@@ -144,11 +146,12 @@ struct rds_ib_connection {
struct rds_ib_incoming *i_ibinc;
u32 i_recv_data_rem;
struct rds_header *i_recv_hdrs;
- u64 i_recv_hdrs_dma;
+ dma_addr_t i_recv_hdrs_dma;
struct rds_ib_recv_work *i_recvs;
u64 i_ack_recv; /* last ACK received */
struct rds_ib_refill_cache i_cache_incs;
struct rds_ib_refill_cache i_cache_frags;
+ atomic_t i_cache_allocs;
/* sending acks */
unsigned long i_ack_flags;
@@ -161,7 +164,7 @@ struct rds_ib_connection {
struct rds_header *i_ack;
struct ib_send_wr i_ack_wr;
struct ib_sge i_ack_sge;
- u64 i_ack_dma;
+ dma_addr_t i_ack_dma;
unsigned long i_ack_queued;
/* Flow control related information
@@ -179,6 +182,14 @@ struct rds_ib_connection {
/* Batched completions */
unsigned int i_unsignaled_wrs;
+
+ /* Endpoint role in connection */
+ bool i_active_side;
+ atomic_t i_cq_quiesce;
+
+ /* Send/Recv vectors */
+ int i_scq_vector;
+ int i_rcq_vector;
};
/* This assumes that atomic_t is at least 32 bits */
@@ -221,9 +232,10 @@ struct rds_ib_device {
spinlock_t spinlock; /* protect the above */
atomic_t refcount;
struct work_struct free_work;
+ int *vector_load;
};
-#define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device)
+#define ibdev_to_node(ibdev) dev_to_node((ibdev)->dev.parent)
#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
/* bits for i_ack_flags */
@@ -249,6 +261,8 @@ struct rds_ib_statistics {
uint64_t s_ib_rx_refill_from_cq;
uint64_t s_ib_rx_refill_from_thread;
uint64_t s_ib_rx_alloc_limit;
+ uint64_t s_ib_rx_total_frags;
+ uint64_t s_ib_rx_total_incs;
uint64_t s_ib_rx_credit_updates;
uint64_t s_ib_ack_sent;
uint64_t s_ib_ack_send_failure;
@@ -271,6 +285,8 @@ struct rds_ib_statistics {
uint64_t s_ib_rdma_mr_1m_reused;
uint64_t s_ib_atomic_cswp;
uint64_t s_ib_atomic_fadd;
+ uint64_t s_ib_recv_added_to_cache;
+ uint64_t s_ib_recv_removed_from_cache;
};
extern struct workqueue_struct *rds_ib_wq;
@@ -401,6 +417,8 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
/* ib_stats.c */
DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
+#define rds_ib_stats_add(member, count) \
+ rds_stats_add_which(rds_ib_stats, member, count)
unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 5b2ab95afa07..1c38d2c7caa8 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -113,24 +113,26 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
}
if (conn->c_version < RDS_PROTOCOL(3, 1)) {
- printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
- " no longer supported\n",
- &conn->c_faddr,
- RDS_PROTOCOL_MAJOR(conn->c_version),
- RDS_PROTOCOL_MINOR(conn->c_version));
+ pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n",
+ &conn->c_laddr, &conn->c_faddr,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version));
rds_conn_destroy(conn);
return;
} else {
- printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
- &conn->c_faddr,
- RDS_PROTOCOL_MAJOR(conn->c_version),
- RDS_PROTOCOL_MINOR(conn->c_version),
- ic->i_flowctl ? ", flow control" : "");
+ pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n",
+ ic->i_active_side ? "Active" : "Passive",
+ &conn->c_laddr, &conn->c_faddr,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version),
+ ic->i_flowctl ? ", flow control" : "");
}
- /*
- * Init rings and fill recv. this needs to wait until protocol negotiation
- * is complete, since ring layout is different from 3.0 to 3.1.
+ atomic_set(&ic->i_cq_quiesce, 0);
+
+ /* Init rings and fill recv. this needs to wait until protocol
+ * negotiation is complete, since ring layout is different
+ * from 3.1 to 4.1.
*/
rds_ib_send_init_ring(ic);
rds_ib_recv_init_ring(ic);
@@ -267,6 +269,10 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
rds_ib_stats_inc(s_ib_tasklet_call);
+ /* if cq has been already reaped, ignore incoming cq event */
+ if (atomic_read(&ic->i_cq_quiesce))
+ return;
+
poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
@@ -308,6 +314,10 @@ static void rds_ib_tasklet_fn_recv(unsigned long data)
rds_ib_stats_inc(s_ib_tasklet_call);
+ /* if cq has been already reaped, ignore incoming cq event */
+ if (atomic_read(&ic->i_cq_quiesce))
+ return;
+
memset(&state, 0, sizeof(state));
poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
@@ -358,6 +368,28 @@ static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context)
tasklet_schedule(&ic->i_send_tasklet);
}
+static inline int ibdev_get_unused_vector(struct rds_ib_device *rds_ibdev)
+{
+ int min = rds_ibdev->vector_load[rds_ibdev->dev->num_comp_vectors - 1];
+ int index = rds_ibdev->dev->num_comp_vectors - 1;
+ int i;
+
+ for (i = rds_ibdev->dev->num_comp_vectors - 1; i >= 0; i--) {
+ if (rds_ibdev->vector_load[i] < min) {
+ index = i;
+ min = rds_ibdev->vector_load[i];
+ }
+ }
+
+ rds_ibdev->vector_load[index]++;
+ return index;
+}
+
+static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index)
+{
+ rds_ibdev->vector_load[index]--;
+}
+
/*
* This needs to be very careful to not leave IS_ERR pointers around for
* cleanup to trip over.
@@ -383,7 +415,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
* completion queue and send queue. This extra space is used for FRMR
* registration and invalidation work requests
*/
- fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0);
+ fr_queue_space = rds_ibdev->use_fastreg ?
+ (RDS_IB_DEFAULT_FR_WR + 1) +
+ (RDS_IB_DEFAULT_FR_INV_WR + 1)
+ : 0;
/* add the conn now so that connection establishment has the dev */
rds_ib_add_conn(rds_ibdev, conn);
@@ -396,39 +431,44 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
/* Protection domain and memory range */
ic->i_pd = rds_ibdev->pd;
+ ic->i_scq_vector = ibdev_get_unused_vector(rds_ibdev);
cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1;
-
+ cq_attr.comp_vector = ic->i_scq_vector;
ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
rds_ib_cq_event_handler, conn,
&cq_attr);
if (IS_ERR(ic->i_send_cq)) {
ret = PTR_ERR(ic->i_send_cq);
ic->i_send_cq = NULL;
+ ibdev_put_vector(rds_ibdev, ic->i_scq_vector);
rdsdebug("ib_create_cq send failed: %d\n", ret);
- goto out;
+ goto rds_ibdev_out;
}
+ ic->i_rcq_vector = ibdev_get_unused_vector(rds_ibdev);
cq_attr.cqe = ic->i_recv_ring.w_nr;
+ cq_attr.comp_vector = ic->i_rcq_vector;
ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv,
rds_ib_cq_event_handler, conn,
&cq_attr);
if (IS_ERR(ic->i_recv_cq)) {
ret = PTR_ERR(ic->i_recv_cq);
ic->i_recv_cq = NULL;
+ ibdev_put_vector(rds_ibdev, ic->i_rcq_vector);
rdsdebug("ib_create_cq recv failed: %d\n", ret);
- goto out;
+ goto send_cq_out;
}
ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
if (ret) {
rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
- goto out;
+ goto recv_cq_out;
}
ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
if (ret) {
rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
- goto out;
+ goto recv_cq_out;
}
/* XXX negotiate max send/recv with remote? */
@@ -445,6 +485,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
attr.send_cq = ic->i_send_cq;
attr.recv_cq = ic->i_recv_cq;
atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
+ atomic_set(&ic->i_fastunreg_wrs, RDS_IB_DEFAULT_FR_INV_WR);
/*
* XXX this can fail if max_*_wr is too large? Are we supposed
@@ -453,7 +494,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
if (ret) {
rdsdebug("rdma_create_qp failed: %d\n", ret);
- goto out;
+ goto recv_cq_out;
}
ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
@@ -463,7 +504,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
if (!ic->i_send_hdrs) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent send failed\n");
- goto out;
+ goto qp_out;
}
ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
@@ -473,7 +514,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
if (!ic->i_recv_hdrs) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent recv failed\n");
- goto out;
+ goto send_hdrs_dma_out;
}
ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
@@ -481,7 +522,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
if (!ic->i_ack) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent ack failed\n");
- goto out;
+ goto recv_hdrs_dma_out;
}
ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
@@ -489,7 +530,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
if (!ic->i_sends) {
ret = -ENOMEM;
rdsdebug("send allocation failed\n");
- goto out;
+ goto ack_dma_out;
}
ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
@@ -497,7 +538,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
if (!ic->i_recvs) {
ret = -ENOMEM;
rdsdebug("recv allocation failed\n");
- goto out;
+ goto sends_out;
}
rds_ib_recv_init_ack(ic);
@@ -505,8 +546,33 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd,
ic->i_send_cq, ic->i_recv_cq);
-out:
+ return ret;
+
+sends_out:
+ vfree(ic->i_sends);
+ack_dma_out:
+ ib_dma_free_coherent(dev, sizeof(struct rds_header),
+ ic->i_ack, ic->i_ack_dma);
+recv_hdrs_dma_out:
+ ib_dma_free_coherent(dev, ic->i_recv_ring.w_nr *
+ sizeof(struct rds_header),
+ ic->i_recv_hdrs, ic->i_recv_hdrs_dma);
+send_hdrs_dma_out:
+ ib_dma_free_coherent(dev, ic->i_send_ring.w_nr *
+ sizeof(struct rds_header),
+ ic->i_send_hdrs, ic->i_send_hdrs_dma);
+qp_out:
+ rdma_destroy_qp(ic->i_cm_id);
+recv_cq_out:
+ if (!ib_destroy_cq(ic->i_recv_cq))
+ ic->i_recv_cq = NULL;
+send_cq_out:
+ if (!ib_destroy_cq(ic->i_send_cq))
+ ic->i_send_cq = NULL;
+rds_ibdev_out:
+ rds_ib_remove_conn(rds_ibdev, conn);
rds_ib_dev_put(rds_ibdev);
+
return ret;
}
@@ -682,6 +748,7 @@ out:
if (ic->i_cm_id == cm_id)
ret = 0;
}
+ ic->i_active_side = true;
return ret;
}
@@ -767,17 +834,27 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
wait_event(rds_ib_ring_empty_wait,
rds_ib_ring_empty(&ic->i_recv_ring) &&
(atomic_read(&ic->i_signaled_sends) == 0) &&
- (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
+ (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR) &&
+ (atomic_read(&ic->i_fastunreg_wrs) == RDS_IB_DEFAULT_FR_INV_WR));
tasklet_kill(&ic->i_send_tasklet);
tasklet_kill(&ic->i_recv_tasklet);
+ atomic_set(&ic->i_cq_quiesce, 1);
+
/* first destroy the ib state that generates callbacks */
if (ic->i_cm_id->qp)
rdma_destroy_qp(ic->i_cm_id);
- if (ic->i_send_cq)
+ if (ic->i_send_cq) {
+ if (ic->rds_ibdev)
+ ibdev_put_vector(ic->rds_ibdev, ic->i_scq_vector);
ib_destroy_cq(ic->i_send_cq);
- if (ic->i_recv_cq)
+ }
+
+ if (ic->i_recv_cq) {
+ if (ic->rds_ibdev)
+ ibdev_put_vector(ic->rds_ibdev, ic->i_rcq_vector);
ib_destroy_cq(ic->i_recv_cq);
+ }
/* then free the resources that ib callbacks use */
if (ic->i_send_hdrs)
@@ -855,6 +932,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
ic->i_sends = NULL;
vfree(ic->i_recvs);
ic->i_recvs = NULL;
+ ic->i_active_side = false;
}
int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index d921adc62765..48332a6ed738 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -104,14 +104,15 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
struct rds_ib_frmr *frmr = &ibmr->u.frmr;
struct ib_send_wr *failed_wr;
struct ib_reg_wr reg_wr;
- int ret;
+ int ret, off = 0;
while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
atomic_inc(&ibmr->ic->i_fastreg_wrs);
cpu_relax();
}
- ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, 0, PAGE_SIZE);
+ ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len,
+ &off, PAGE_SIZE);
if (unlikely(ret != ibmr->sg_len))
return ret < 0 ? ret : -EINVAL;
@@ -240,8 +241,8 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
if (frmr->fr_state != FRMR_IS_INUSE)
goto out;
- while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
- atomic_inc(&ibmr->ic->i_fastreg_wrs);
+ while (atomic_dec_return(&ibmr->ic->i_fastunreg_wrs) <= 0) {
+ atomic_inc(&ibmr->ic->i_fastunreg_wrs);
cpu_relax();
}
@@ -260,7 +261,7 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
if (unlikely(ret)) {
frmr->fr_state = FRMR_IS_STALE;
frmr->fr_inv = false;
- atomic_inc(&ibmr->ic->i_fastreg_wrs);
+ atomic_inc(&ibmr->ic->i_fastunreg_wrs);
pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
goto out;
}
@@ -288,9 +289,10 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
if (frmr->fr_inv) {
frmr->fr_state = FRMR_IS_FREE;
frmr->fr_inv = false;
+ atomic_inc(&ic->i_fastreg_wrs);
+ } else {
+ atomic_inc(&ic->i_fastunreg_wrs);
}
-
- atomic_inc(&ic->i_fastreg_wrs);
}
void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 1c754f4acbe5..5d6e98a79a5e 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -45,7 +45,6 @@
struct rds_ib_fmr {
struct ib_fmr *fmr;
- u64 *dma;
};
enum rds_ib_fr_state {
@@ -108,8 +107,6 @@ struct rds_ib_mr_pool {
};
extern struct workqueue_struct *rds_ib_mr_wq;
-extern unsigned int rds_ib_mr_1m_pool_size;
-extern unsigned int rds_ib_mr_8k_pool_size;
extern bool prefer_frmr;
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 606a11f681d2..e10624aa6959 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -194,6 +194,8 @@ static void rds_ib_frag_free(struct rds_ib_connection *ic,
rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
+ atomic_add(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
+ rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE);
}
/* Recycle inc after freeing attached frags */
@@ -261,6 +263,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i
atomic_dec(&rds_ib_allocation);
return NULL;
}
+ rds_ib_stats_inc(s_ib_rx_total_incs);
}
INIT_LIST_HEAD(&ibinc->ii_frags);
rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
@@ -278,6 +281,8 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
if (cache_item) {
frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
+ atomic_sub(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
+ rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE);
} else {
frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
if (!frag)
@@ -290,6 +295,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
kmem_cache_free(rds_ib_frag_slab, frag);
return NULL;
}
+ rds_ib_stats_inc(s_ib_rx_total_frags);
}
INIT_LIST_HEAD(&frag->f_item);
@@ -905,8 +911,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
ic->i_ibinc = ibinc;
hdr = &ibinc->ii_inc.i_hdr;
+ ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
+ local_clock();
memcpy(hdr, ihdr, sizeof(*hdr));
ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
+ ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
+ local_clock();
rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
ic->i_recv_data_rem, hdr->h_flags);
@@ -980,8 +990,8 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
} else {
/* We expect errors as the qp is drained during shutdown */
if (rds_conn_up(conn) || rds_conn_connecting(conn))
- rds_ib_conn_error(conn, "recv completion on %pI4 had status %u (%s), disconnecting and reconnecting\n",
- &conn->c_faddr,
+ rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
+ &conn->c_laddr, &conn->c_faddr,
wc->status,
ib_wc_status_msg(wc->status));
}
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 84d90c97332f..6ab39dbcca01 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -69,16 +69,6 @@ static void rds_ib_send_complete(struct rds_message *rm,
complete(rm, notify_status);
}
-static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
- struct rm_data_op *op,
- int wc_status)
-{
- if (op->op_nents)
- ib_dma_unmap_sg(ic->i_cm_id->device,
- op->op_sg, op->op_nents,
- DMA_TO_DEVICE);
-}
-
static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
struct rm_rdma_op *op,
int wc_status)
@@ -139,6 +129,21 @@ static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
rds_ib_stats_inc(s_ib_atomic_fadd);
}
+static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
+ struct rm_data_op *op,
+ int wc_status)
+{
+ struct rds_message *rm = container_of(op, struct rds_message, data);
+
+ if (op->op_nents)
+ ib_dma_unmap_sg(ic->i_cm_id->device,
+ op->op_sg, op->op_nents,
+ DMA_TO_DEVICE);
+
+ if (rm->rdma.op_active && rm->data.op_notify)
+ rds_ib_send_unmap_rdma(ic, &rm->rdma, wc_status);
+}
+
/*
* Unmap the resources associated with a struct send_work.
*
@@ -300,8 +305,8 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
/* We expect errors as the qp is drained during shutdown */
if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
- rds_ib_conn_error(conn, "send completion on %pI4 had status %u (%s), disconnecting and reconnecting\n",
- &conn->c_faddr, wc->status,
+ rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
+ &conn->c_laddr, &conn->c_faddr, wc->status,
ib_wc_status_msg(wc->status));
}
}
@@ -765,7 +770,6 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
if (work_alloc != 1) {
- rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
rds_ib_stats_inc(s_ib_tx_ring_full);
ret = -ENOMEM;
goto out;
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index 7e78dca1f252..9252ad126335 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -55,6 +55,8 @@ static const char *const rds_ib_stat_names[] = {
"ib_rx_refill_from_cq",
"ib_rx_refill_from_thread",
"ib_rx_alloc_limit",
+ "ib_rx_total_frags",
+ "ib_rx_total_incs",
"ib_rx_credit_updates",
"ib_ack_sent",
"ib_ack_send_failure",
diff --git a/net/rds/message.c b/net/rds/message.c
index 6cb91061556a..49bfb512d808 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -42,6 +42,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
[RDS_EXTHDR_NPATHS] = sizeof(u16),
+[RDS_EXTHDR_GEN_NUM] = sizeof(u32),
};
diff --git a/net/rds/page.c b/net/rds/page.c
index e2b5a5832d3d..7cc57e098ddb 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -45,35 +45,6 @@ struct rds_page_remainder {
static
DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
-/*
- * returns 0 on success or -errno on failure.
- *
- * We don't have to worry about flush_dcache_page() as this only works
- * with private pages. If, say, we were to do directed receive to pinned
- * user pages we'd have to worry more about cache coherence. (Though
- * the flush_dcache_page() in get_user_pages() would probably be enough).
- */
-int rds_page_copy_user(struct page *page, unsigned long offset,
- void __user *ptr, unsigned long bytes,
- int to_user)
-{
- unsigned long ret;
- void *addr;
-
- addr = kmap(page);
- if (to_user) {
- rds_stats_add(s_copy_to_user, bytes);
- ret = copy_to_user(ptr, addr + offset, bytes);
- } else {
- rds_stats_add(s_copy_from_user, bytes);
- ret = copy_from_user(addr + offset, ptr, bytes);
- }
- kunmap(page);
-
- return ret ? -EFAULT : 0;
-}
-EXPORT_SYMBOL_GPL(rds_page_copy_user);
-
/**
* rds_page_remainder_alloc - build up regions of a message.
*
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 4c93badeabf2..f06fac4886b0 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -40,7 +40,6 @@
/*
* XXX
* - build with sparse
- * - should we limit the size of a mr region? let transport return failure?
* - should we detect duplicate keys on a socket? hmm.
* - an rdma is an mlock, apply rlimit?
*/
@@ -135,7 +134,7 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
/* Release any MRs associated with this socket */
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
while ((node = rb_first(&rs->rs_rdma_keys))) {
- mr = container_of(node, struct rds_mr, r_rb_node);
+ mr = rb_entry(node, struct rds_mr, r_rb_node);
if (mr->r_trans == rs->rs_transport)
mr->r_invalidate = 0;
rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
@@ -200,6 +199,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
goto out;
}
+ /* Restrict the size of mr irrespective of underlying transport
+ * To account for unaligned mr regions, subtract one from nr_pages
+ */
+ if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) {
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
args->vec.addr, args->vec.bytes, nr_pages);
@@ -415,7 +422,8 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
if (!mr) {
- printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
+ pr_debug("rds: trying to unuse MR with unknown r_key %u!\n",
+ r_key);
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
return;
}
@@ -626,6 +634,16 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
}
op->op_notifier->n_user_token = args->user_token;
op->op_notifier->n_status = RDS_RDMA_SUCCESS;
+
+ /* Enable rmda notification on data operation for composite
+ * rds messages and make sure notification is enabled only
+ * for the data operation which follows it so that application
+ * gets notified only after full message gets delivered.
+ */
+ if (rm->data.op_sg) {
+ rm->rdma.op_notify = 0;
+ rm->data.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+ }
}
/* The cookie contains the R_Key of the remote memory region, and
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 345f09059e9f..fc59821f0a27 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -100,11 +100,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
trans->cm_connect_complete(conn, event);
break;
+ case RDMA_CM_EVENT_REJECTED:
+ rdsdebug("Connection rejected: %s\n",
+ rdma_reject_msg(cm_id, event->status));
+ /* FALLTHROUGH */
case RDMA_CM_EVENT_ADDR_ERROR:
case RDMA_CM_EVENT_ROUTE_ERROR:
case RDMA_CM_EVENT_CONNECT_ERROR:
case RDMA_CM_EVENT_UNREACHABLE:
- case RDMA_CM_EVENT_REJECTED:
case RDMA_CM_EVENT_DEVICE_REMOVAL:
case RDMA_CM_EVENT_ADDR_CHANGE:
if (conn)
@@ -203,18 +206,13 @@ static int rds_rdma_init(void)
{
int ret;
- ret = rds_rdma_listen_init();
+ ret = rds_ib_init();
if (ret)
goto out;
- ret = rds_ib_init();
+ ret = rds_rdma_listen_init();
if (ret)
- goto err_ib_init;
-
- goto out;
-
-err_ib_init:
- rds_rdma_listen_stop();
+ rds_ib_exit();
out:
return ret;
}
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 67ba67c058b1..82d38ccf5e8b 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -50,6 +50,9 @@ void rdsdebug(char *fmt, ...)
#define RDS_FRAG_SHIFT 12
#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT))
+/* Used to limit both RDMA and non-RDMA RDS message to 1MB */
+#define RDS_MAX_MSG_SIZE ((unsigned int)(1 << 20))
+
#define RDS_CONG_MAP_BYTES (65536 / 8)
#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
@@ -144,25 +147,28 @@ struct rds_connection {
/* Protocol version */
unsigned int c_version;
- possible_net_t c_net;
+ struct net *c_net;
struct list_head c_map_item;
unsigned long c_map_queued;
struct rds_conn_path c_path[RDS_MPATH_WORKERS];
wait_queue_head_t c_hs_waitq; /* handshake waitq */
+
+ u32 c_my_gen_num;
+ u32 c_peer_gen_num;
};
static inline
struct net *rds_conn_net(struct rds_connection *conn)
{
- return read_pnet(&conn->c_net);
+ return conn->c_net;
}
static inline
void rds_conn_net_set(struct rds_connection *conn, struct net *net)
{
- write_pnet(&conn->c_net, net);
+ conn->c_net = get_net(net);
}
#define RDS_FLAG_CONG_BITMAP 0x01
@@ -243,9 +249,15 @@ struct rds_ext_header_rdma_dest {
/* Extension header announcing number of paths.
* Implicit length = 2 bytes.
*/
-#define RDS_EXTHDR_NPATHS 4
+#define RDS_EXTHDR_NPATHS 5
+#define RDS_EXTHDR_GEN_NUM 6
#define __RDS_EXTHDR_MAX 16 /* for now */
+#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
+#define RDS_MSG_RX_HDR 0
+#define RDS_MSG_RX_START 1
+#define RDS_MSG_RX_END 2
+#define RDS_MSG_RX_CMSG 3
struct rds_incoming {
atomic_t i_refcount;
@@ -258,6 +270,7 @@ struct rds_incoming {
rds_rdma_cookie_t i_rdma_cookie;
struct timeval i_rx_tstamp;
+ u64 i_rx_lat_trace[RDS_RX_MAX_TRACES];
};
struct rds_mr {
@@ -338,6 +351,7 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
#define RDS_MSG_RETRANSMITTED 5
#define RDS_MSG_MAPPED 6
#define RDS_MSG_PAGEVEC 7
+#define RDS_MSG_FLUSH 8
struct rds_message {
atomic_t m_refcount;
@@ -414,6 +428,7 @@ struct rds_message {
} rdma;
struct rm_data_op {
unsigned int op_active:1;
+ unsigned int op_notify:1;
unsigned int op_nents;
unsigned int op_count;
unsigned int op_dmasg;
@@ -566,6 +581,10 @@ struct rds_sock {
unsigned char rs_recverr,
rs_cong_monitor;
u32 rs_hash_initval;
+
+ /* Socket receive path trace points*/
+ u8 rs_rx_traces;
+ u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
};
static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
@@ -625,6 +644,9 @@ struct rds_statistics {
uint64_t s_cong_update_received;
uint64_t s_cong_send_error;
uint64_t s_cong_send_blocked;
+ uint64_t s_recv_bytes_added_to_socket;
+ uint64_t s_recv_bytes_removed_from_socket;
+
};
/* af_rds.c */
@@ -664,6 +686,7 @@ void rds_cong_exit(void);
struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
/* conn.c */
+extern u32 rds_gen_num;
int rds_conn_init(void);
void rds_conn_exit(void);
struct rds_connection *rds_conn_create(struct net *net,
@@ -683,10 +706,6 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
struct rds_info_lengths *lens,
int (*visitor)(struct rds_connection *, void *),
size_t item_len);
-__printf(2, 3)
-void __rds_conn_error(struct rds_connection *conn, const char *, ...);
-#define rds_conn_error(conn, fmt...) \
- __rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
__printf(2, 3)
void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...);
@@ -779,13 +798,6 @@ static inline int rds_message_verify_checksum(const struct rds_header *hdr)
/* page.c */
int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
gfp_t gfp);
-int rds_page_copy_user(struct page *page, unsigned long offset,
- void __user *ptr, unsigned long bytes,
- int to_user);
-#define rds_page_copy_to_user(page, offset, ptr, bytes) \
- rds_page_copy_user(page, offset, ptr, bytes, 1)
-#define rds_page_copy_from_user(page, offset, ptr, bytes) \
- rds_page_copy_user(page, offset, ptr, bytes, 0)
void rds_page_exit(void);
/* recv.c */
@@ -891,7 +903,7 @@ void rds_connect_path_complete(struct rds_conn_path *conn, int curr);
void rds_connect_complete(struct rds_connection *conn);
/* transport.c */
-int rds_trans_register(struct rds_transport *trans);
+void rds_trans_register(struct rds_transport *trans);
void rds_trans_unregister(struct rds_transport *trans);
struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr);
void rds_trans_put(struct rds_transport *trans);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index cbfabdf3ff48..8b7e7b7f2c2d 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -43,6 +43,8 @@
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
__be32 saddr)
{
+ int i;
+
atomic_set(&inc->i_refcount, 1);
INIT_LIST_HEAD(&inc->i_item);
inc->i_conn = conn;
@@ -50,6 +52,9 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
inc->i_rdma_cookie = 0;
inc->i_rx_tstamp.tv_sec = 0;
inc->i_rx_tstamp.tv_usec = 0;
+
+ for (i = 0; i < RDS_RX_MAX_TRACES; i++)
+ inc->i_rx_lat_trace[i] = 0;
}
EXPORT_SYMBOL_GPL(rds_inc_init);
@@ -94,6 +99,10 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
return;
rs->rs_rcv_bytes += delta;
+ if (delta > 0)
+ rds_stats_add(s_recv_bytes_added_to_socket, delta);
+ else
+ rds_stats_add(s_recv_bytes_removed_from_socket, -delta);
now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
@@ -120,6 +129,36 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
/* do nothing if no change in cong state */
}
+static void rds_conn_peer_gen_update(struct rds_connection *conn,
+ u32 peer_gen_num)
+{
+ int i;
+ struct rds_message *rm, *tmp;
+ unsigned long flags;
+
+ WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP);
+ if (peer_gen_num != 0) {
+ if (conn->c_peer_gen_num != 0 &&
+ peer_gen_num != conn->c_peer_gen_num) {
+ for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+ struct rds_conn_path *cp;
+
+ cp = &conn->c_path[i];
+ spin_lock_irqsave(&cp->cp_lock, flags);
+ cp->cp_next_tx_seq = 1;
+ cp->cp_next_rx_seq = 0;
+ list_for_each_entry_safe(rm, tmp,
+ &cp->cp_retrans,
+ m_conn_item) {
+ set_bit(RDS_MSG_FLUSH, &rm->m_flags);
+ }
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
+ }
+ }
+ conn->c_peer_gen_num = peer_gen_num;
+ }
+}
+
/*
* Process all extension headers that come with this message.
*/
@@ -163,7 +202,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
union {
struct rds_ext_header_version version;
u16 rds_npaths;
+ u32 rds_gen_num;
} buffer;
+ u32 new_peer_gen_num = 0;
while (1) {
len = sizeof(buffer);
@@ -176,6 +217,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
conn->c_npaths = min_t(int, RDS_MPATH_WORKERS,
buffer.rds_npaths);
break;
+ case RDS_EXTHDR_GEN_NUM:
+ new_peer_gen_num = buffer.rds_gen_num;
+ break;
default:
pr_warn_ratelimited("ignoring unknown exthdr type "
"0x%x\n", type);
@@ -183,6 +227,7 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
}
/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
conn->c_npaths = max_t(int, conn->c_npaths, 1);
+ rds_conn_peer_gen_update(conn, new_peer_gen_num);
}
/* rds_start_mprds() will synchronously start multiple paths when appropriate.
@@ -333,6 +378,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
if (sock_flag(sk, SOCK_RCVTSTAMP))
do_gettimeofday(&inc->i_rx_tstamp);
rds_inc_addref(inc);
+ inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock();
list_add_tail(&inc->i_item, &rs->rs_recv_queue);
__rds_wake_sk_sleep(sk);
} else {
@@ -494,7 +540,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
if (ret)
- return ret;
+ goto out;
}
if ((inc->i_rx_tstamp.tv_sec != 0) &&
@@ -503,10 +549,30 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
sizeof(struct timeval),
&inc->i_rx_tstamp);
if (ret)
- return ret;
+ goto out;
}
- return 0;
+ if (rs->rs_rx_traces) {
+ struct rds_cmsg_rx_trace t;
+ int i, j;
+
+ inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock();
+ t.rx_traces = rs->rs_rx_traces;
+ for (i = 0; i < rs->rs_rx_traces; i++) {
+ j = rs->rs_rx_trace[i];
+ t.rx_trace_pos[i] = j;
+ t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] -
+ inc->i_rx_lat_trace[j];
+ }
+
+ ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY,
+ sizeof(t), &t);
+ if (ret)
+ goto out;
+ }
+
+out:
+ return ret;
}
int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
diff --git a/net/rds/send.c b/net/rds/send.c
index 896626b9a0ef..5cc64039caf7 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -259,8 +259,9 @@ restart:
* connection.
* Therefore, we never retransmit messages with RDMA ops.
*/
- if (rm->rdma.op_active &&
- test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
+ if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) ||
+ (rm->rdma.op_active &&
+ test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) {
spin_lock_irqsave(&cp->cp_lock, flags);
if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
list_move(&rm->m_conn_item, &to_be_dropped);
@@ -475,12 +476,14 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
struct rm_rdma_op *ro;
struct rds_notifier *notifier;
unsigned long flags;
+ unsigned int notify = 0;
spin_lock_irqsave(&rm->m_rs_lock, flags);
+ notify = rm->rdma.op_notify | rm->data.op_notify;
ro = &rm->rdma;
if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
- ro->op_active && ro->op_notify && ro->op_notifier) {
+ ro->op_active && notify && ro->op_notifier) {
notifier = ro->op_notifier;
rs = rm->m_rs;
sock_hold(rds_rs_to_sk(rs));
@@ -944,6 +947,11 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
ret = rds_cmsg_rdma_map(rs, rm, cmsg);
if (!ret)
*allocated_mr = 1;
+ else if (ret == -ENODEV)
+ /* Accommodate the get_mr() case which can fail
+ * if connection isn't established yet.
+ */
+ ret = -EAGAIN;
break;
case RDS_CMSG_ATOMIC_CSWP:
case RDS_CMSG_ATOMIC_FADD:
@@ -986,6 +994,26 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn)
return hash;
}
+static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
+{
+ struct rds_rdma_args *args;
+ struct cmsghdr *cmsg;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ if (cmsg->cmsg_level != SOL_RDS)
+ continue;
+
+ if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
+ args = CMSG_DATA(cmsg);
+ *rdma_bytes += args->remote_vec.bytes;
+ }
+ }
+ return 0;
+}
+
int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
{
struct sock *sk = sock->sk;
@@ -1000,6 +1028,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
int nonblock = msg->msg_flags & MSG_DONTWAIT;
long timeo = sock_sndtimeo(sk, nonblock);
struct rds_conn_path *cpath;
+ size_t total_payload_len = payload_len, rdma_payload_len = 0;
/* Mirror Linux UDP mirror of BSD error message compatibility */
/* XXX: Perhaps MSG_MORE someday */
@@ -1032,6 +1061,16 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
}
release_sock(sk);
+ ret = rds_rdma_bytes(msg, &rdma_payload_len);
+ if (ret)
+ goto out;
+
+ total_payload_len += rdma_payload_len;
+ if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) {
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
if (payload_len > rds_sk_sndbuf(rs)) {
ret = -EMSGSIZE;
goto out;
@@ -1081,8 +1120,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
/* Parse any control messages the user may have included. */
ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
- if (ret)
+ if (ret) {
+ /* Trigger connection so that its ready for the next retry */
+ if (ret == -EAGAIN)
+ rds_conn_connect_if_down(conn);
goto out;
+ }
if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
@@ -1168,7 +1211,7 @@ out:
* or
* RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED
*/
-int
+static int
rds_send_probe(struct rds_conn_path *cp, __be16 sport,
__be16 dport, u8 h_flags)
{
@@ -1209,6 +1252,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_NPATHS, &npaths,
sizeof(npaths));
+ rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_GEN_NUM,
+ &cp->cp_conn->c_my_gen_num,
+ sizeof(u32));
}
spin_unlock_irqrestore(&cp->cp_lock, flags);
@@ -1233,7 +1280,7 @@ rds_send_pong(struct rds_conn_path *cp, __be16 dport)
return rds_send_probe(cp, 0, dport, 0);
}
-void
+static void
rds_send_ping(struct rds_connection *conn)
{
unsigned long flags;
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 20e2923dc827..225690076773 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -220,7 +220,7 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
write_unlock_bh(&sock->sk->sk_callback_lock);
}
-static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
+static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
{
@@ -229,6 +229,7 @@ static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
unsigned long flags;
struct sockaddr_in sin;
int sinlen;
+ struct socket *sock;
spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
@@ -237,12 +238,17 @@ static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
- sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0);
- tsinfo.local_addr = sin.sin_addr.s_addr;
- tsinfo.local_port = sin.sin_port;
- sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1);
- tsinfo.peer_addr = sin.sin_addr.s_addr;
- tsinfo.peer_port = sin.sin_port;
+ sock = tc->t_sock;
+ if (sock) {
+ sock->ops->getname(sock, (struct sockaddr *)&sin,
+ &sinlen, 0);
+ tsinfo.local_addr = sin.sin_addr.s_addr;
+ tsinfo.local_port = sin.sin_port;
+ sock->ops->getname(sock, (struct sockaddr *)&sin,
+ &sinlen, 1);
+ tsinfo.peer_addr = sin.sin_addr.s_addr;
+ tsinfo.peer_port = sin.sin_port;
+ }
tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
tsinfo.data_rem = tc->t_tinc_data_rem;
@@ -360,7 +366,7 @@ struct rds_transport rds_tcp_transport = {
.t_mp_capable = 1,
};
-static int rds_tcp_netid;
+static unsigned int rds_tcp_netid;
/* per-network namespace private data for this module */
struct rds_tcp_net {
@@ -478,9 +484,10 @@ static void __net_exit rds_tcp_exit_net(struct net *net)
* we do need to clean up the listen socket here.
*/
if (rtn->rds_tcp_listen_sock) {
- rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
+ struct socket *lsock = rtn->rds_tcp_listen_sock;
+
rtn->rds_tcp_listen_sock = NULL;
- flush_work(&rtn->rds_tcp_accept_w);
+ rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
}
}
@@ -517,13 +524,13 @@ static void rds_tcp_kill_sock(struct net *net)
struct rds_tcp_connection *tc, *_tc;
LIST_HEAD(tmp_list);
struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+ struct socket *lsock = rtn->rds_tcp_listen_sock;
- rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
rtn->rds_tcp_listen_sock = NULL;
- flush_work(&rtn->rds_tcp_accept_w);
+ rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
spin_lock_irq(&rds_tcp_conn_lock);
list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
- struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
+ struct net *c_net = tc->t_cpath->cp_conn->c_net;
if (net != c_net || !tc->t_sock)
continue;
@@ -540,8 +547,12 @@ static void rds_tcp_kill_sock(struct net *net)
void *rds_tcp_listen_sock_def_readable(struct net *net)
{
struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+ struct socket *lsock = rtn->rds_tcp_listen_sock;
- return rtn->rds_tcp_listen_sock->sk->sk_user_data;
+ if (!lsock)
+ return NULL;
+
+ return lsock->sk->sk_user_data;
}
static int rds_tcp_dev_event(struct notifier_block *this,
@@ -578,7 +589,7 @@ static void rds_tcp_sysctl_reset(struct net *net)
spin_lock_irq(&rds_tcp_conn_lock);
list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
- struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
+ struct net *c_net = tc->t_cpath->cp_conn->c_net;
if (net != c_net || !tc->t_sock)
continue;
@@ -632,35 +643,31 @@ static int rds_tcp_init(void)
goto out;
}
- ret = register_netdevice_notifier(&rds_tcp_dev_notifier);
- if (ret) {
- pr_warn("could not register rds_tcp_dev_notifier\n");
- goto out;
- }
-
- ret = register_pernet_subsys(&rds_tcp_net_ops);
+ ret = rds_tcp_recv_init();
if (ret)
goto out_slab;
- ret = rds_tcp_recv_init();
+ ret = register_pernet_subsys(&rds_tcp_net_ops);
if (ret)
+ goto out_recv;
+
+ ret = register_netdevice_notifier(&rds_tcp_dev_notifier);
+ if (ret) {
+ pr_warn("could not register rds_tcp_dev_notifier\n");
goto out_pernet;
+ }
- ret = rds_trans_register(&rds_tcp_transport);
- if (ret)
- goto out_recv;
+ rds_trans_register(&rds_tcp_transport);
rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
goto out;
-out_recv:
- rds_tcp_recv_exit();
out_pernet:
unregister_pernet_subsys(&rds_tcp_net_ops);
+out_recv:
+ rds_tcp_recv_exit();
out_slab:
- if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
- pr_warn("could not unregister rds_tcp_dev_notifier\n");
kmem_cache_destroy(rds_tcp_conn_slab);
out:
return ret;
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 9a1cc8906576..56ea6620fcf9 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -66,7 +66,7 @@ void rds_tcp_state_change(struct sock *sk);
/* tcp_listen.c */
struct socket *rds_tcp_listen_init(struct net *);
-void rds_tcp_listen_stop(struct socket *);
+void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
void rds_tcp_listen_data_ready(struct sock *sk);
int rds_tcp_accept_one(struct socket *sock);
int rds_tcp_keepalive(struct socket *sock);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 05f61c533ed3..d6839d96d539 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -60,7 +60,19 @@ void rds_tcp_state_change(struct sock *sk)
case TCP_SYN_RECV:
break;
case TCP_ESTABLISHED:
- rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
+ /* Force the peer to reconnect so that we have the
+ * TCP ports going from <smaller-ip>.<transient> to
+ * <larger-ip>.<RDS_TCP_PORT>. We avoid marking the
+ * RDS connection as RDS_CONN_UP until the reconnect,
+ * to avoid RDS datagram loss.
+ */
+ if (cp->cp_conn->c_laddr > cp->cp_conn->c_faddr &&
+ rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
+ RDS_CONN_ERROR)) {
+ rds_conn_path_drop(cp);
+ } else {
+ rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
+ }
break;
case TCP_CLOSE_WAIT:
case TCP_CLOSE:
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index e0b23fb5b8d5..507678853e6c 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -79,31 +79,27 @@ bail:
* smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side
* by moving them to CONNECTING in this function.
*/
+static
struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
{
int i;
bool peer_is_smaller = (conn->c_faddr < conn->c_laddr);
- int npaths = conn->c_npaths;
-
- if (npaths <= 1) {
- struct rds_conn_path *cp = &conn->c_path[0];
- int ret;
-
- ret = rds_conn_path_transition(cp, RDS_CONN_DOWN,
- RDS_CONN_CONNECTING);
- if (!ret)
- rds_conn_path_transition(cp, RDS_CONN_ERROR,
- RDS_CONN_CONNECTING);
- return cp->cp_transport_data;
- }
+ int npaths = max_t(int, 1, conn->c_npaths);
- /* for mprds, paths with cp_index > 0 MUST be initiated by the peer
+ /* for mprds, all paths MUST be initiated by the peer
* with the smaller address.
*/
- if (!peer_is_smaller)
+ if (!peer_is_smaller) {
+ /* Make sure we initiate at least one path if this
+ * has not already been done; rds_start_mprds() will
+ * take care of additional paths, if necessary.
+ */
+ if (npaths == 1)
+ rds_conn_path_connect_if_down(&conn->c_path[0]);
return NULL;
+ }
- for (i = 1; i < npaths; i++) {
+ for (i = 0; i < npaths; i++) {
struct rds_conn_path *cp = &conn->c_path[i];
if (rds_conn_path_transition(cp, RDS_CONN_DOWN,
@@ -137,7 +133,7 @@ int rds_tcp_accept_one(struct socket *sock)
new_sock->type = sock->type;
new_sock->ops = sock->ops;
- ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
+ ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, true);
if (ret < 0)
goto out;
@@ -171,8 +167,8 @@ int rds_tcp_accept_one(struct socket *sock)
mutex_lock(&rs_tcp->t_conn_path_lock);
cp = rs_tcp->t_cpath;
conn_state = rds_conn_path_state(cp);
- if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP &&
- conn_state != RDS_CONN_ERROR)
+ WARN_ON(conn_state == RDS_CONN_UP);
+ if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR)
goto rst_nsk;
if (rs_tcp->t_sock) {
/* Need to resolve a duelling SYN between peers.
@@ -227,6 +223,9 @@ void rds_tcp_listen_data_ready(struct sock *sk)
* before it has been accepted and the accepter has set up their
* data_ready.. we only want to queue listen work for our listening
* socket
+ *
+ * (*ready)() may be null if we are racing with netns delete, and
+ * the listen socket is being torn down.
*/
if (sk->sk_state == TCP_LISTEN)
rds_tcp_accept_work(sk);
@@ -235,7 +234,8 @@ void rds_tcp_listen_data_ready(struct sock *sk)
out:
read_unlock_bh(&sk->sk_callback_lock);
- ready(sk);
+ if (ready)
+ ready(sk);
}
struct socket *rds_tcp_listen_init(struct net *net)
@@ -275,7 +275,7 @@ out:
return NULL;
}
-void rds_tcp_listen_stop(struct socket *sock)
+void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor)
{
struct sock *sk;
@@ -296,5 +296,6 @@ void rds_tcp_listen_stop(struct socket *sock)
/* wait for accepts to stop and close the socket */
flush_workqueue(rds_wq);
+ flush_work(acceptor);
sock_release(sock);
}
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index ad4892e97f91..e006ef8e6d40 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -180,6 +180,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
rdsdebug("alloced tinc %p\n", tinc);
rds_inc_path_init(&tinc->ti_inc, cp,
cp->cp_conn->c_faddr);
+ tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
+ local_clock();
+
/*
* XXX * we might be able to use the __ variants when
* we've already serialized at a higher level.
@@ -204,6 +207,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
/* could be 0 for a 0 len message */
tc->t_tinc_data_rem =
be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
+ tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
+ local_clock();
}
}
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 89d09b481f47..dcf4742083ea 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -100,6 +100,9 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags);
tc->t_last_expected_una = rm->m_ack_seq + 1;
+ if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
+
rdsdebug("rm %p tcp nxt %u ack_seq %llu\n",
rm, rds_tcp_snd_nxt(tc),
(unsigned long long)rm->m_ack_seq);
diff --git a/net/rds/threads.c b/net/rds/threads.c
index e42df11bf30a..e36e333a0aa0 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -171,8 +171,7 @@ void rds_connect_worker(struct work_struct *work)
RDS_CONN_DOWN))
rds_queue_reconnect(cp);
else
- rds_conn_path_error(cp,
- "RDS: connect failed\n");
+ rds_conn_path_error(cp, "connect failed\n");
}
}
}
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 2ffd3e30c643..0b188dd0a344 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -40,7 +40,7 @@
static struct rds_transport *transports[RDS_TRANS_COUNT];
static DECLARE_RWSEM(rds_trans_sem);
-int rds_trans_register(struct rds_transport *trans)
+void rds_trans_register(struct rds_transport *trans)
{
BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ);
@@ -55,8 +55,6 @@ int rds_trans_register(struct rds_transport *trans)
}
up_write(&rds_trans_sem);
-
- return 0;
}
EXPORT_SYMBOL_GPL(rds_trans_register);
diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig
index 868f1ad0415a..060600b03fad 100644
--- a/net/rfkill/Kconfig
+++ b/net/rfkill/Kconfig
@@ -23,17 +23,6 @@ config RFKILL_INPUT
depends on INPUT = y || RFKILL = INPUT
default y if !EXPERT
-config RFKILL_REGULATOR
- tristate "Generic rfkill regulator driver"
- depends on RFKILL || !RFKILL
- depends on REGULATOR
- help
- This options enable controlling radio transmitters connected to
- voltage regulator using the regulator framework.
-
- To compile this driver as a module, choose M here: the module will
- be called rfkill-regulator.
-
config RFKILL_GPIO
tristate "GPIO RFKILL driver"
depends on RFKILL
diff --git a/net/rfkill/Makefile b/net/rfkill/Makefile
index 311768783f4a..87a80aded0b3 100644
--- a/net/rfkill/Makefile
+++ b/net/rfkill/Makefile
@@ -5,5 +5,4 @@
rfkill-y += core.o
rfkill-$(CONFIG_RFKILL_INPUT) += input.o
obj-$(CONFIG_RFKILL) += rfkill.o
-obj-$(CONFIG_RFKILL_REGULATOR) += rfkill-regulator.o
obj-$(CONFIG_RFKILL_GPIO) += rfkill-gpio.o
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 884027f62783..2064c3a35ef8 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -176,6 +176,50 @@ static void rfkill_led_trigger_unregister(struct rfkill *rfkill)
{
led_trigger_unregister(&rfkill->led_trigger);
}
+
+static struct led_trigger rfkill_any_led_trigger;
+static struct work_struct rfkill_any_work;
+
+static void rfkill_any_led_trigger_worker(struct work_struct *work)
+{
+ enum led_brightness brightness = LED_OFF;
+ struct rfkill *rfkill;
+
+ mutex_lock(&rfkill_global_mutex);
+ list_for_each_entry(rfkill, &rfkill_list, node) {
+ if (!(rfkill->state & RFKILL_BLOCK_ANY)) {
+ brightness = LED_FULL;
+ break;
+ }
+ }
+ mutex_unlock(&rfkill_global_mutex);
+
+ led_trigger_event(&rfkill_any_led_trigger, brightness);
+}
+
+static void rfkill_any_led_trigger_event(void)
+{
+ schedule_work(&rfkill_any_work);
+}
+
+static void rfkill_any_led_trigger_activate(struct led_classdev *led_cdev)
+{
+ rfkill_any_led_trigger_event();
+}
+
+static int rfkill_any_led_trigger_register(void)
+{
+ INIT_WORK(&rfkill_any_work, rfkill_any_led_trigger_worker);
+ rfkill_any_led_trigger.name = "rfkill-any";
+ rfkill_any_led_trigger.activate = rfkill_any_led_trigger_activate;
+ return led_trigger_register(&rfkill_any_led_trigger);
+}
+
+static void rfkill_any_led_trigger_unregister(void)
+{
+ led_trigger_unregister(&rfkill_any_led_trigger);
+ cancel_work_sync(&rfkill_any_work);
+}
#else
static void rfkill_led_trigger_event(struct rfkill *rfkill)
{
@@ -189,6 +233,19 @@ static inline int rfkill_led_trigger_register(struct rfkill *rfkill)
static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill)
{
}
+
+static void rfkill_any_led_trigger_event(void)
+{
+}
+
+static int rfkill_any_led_trigger_register(void)
+{
+ return 0;
+}
+
+static void rfkill_any_led_trigger_unregister(void)
+{
+}
#endif /* CONFIG_RFKILL_LEDS */
static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill,
@@ -297,6 +354,7 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
spin_unlock_irqrestore(&rfkill->lock, flags);
rfkill_led_trigger_event(rfkill);
+ rfkill_any_led_trigger_event();
if (prev != curr)
rfkill_event(rfkill);
@@ -477,11 +535,9 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
spin_unlock_irqrestore(&rfkill->lock, flags);
rfkill_led_trigger_event(rfkill);
+ rfkill_any_led_trigger_event();
- if (!rfkill->registered)
- return ret;
-
- if (prev != blocked)
+ if (rfkill->registered && prev != blocked)
schedule_work(&rfkill->uevent_work);
return ret;
@@ -523,6 +579,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
schedule_work(&rfkill->uevent_work);
rfkill_led_trigger_event(rfkill);
+ rfkill_any_led_trigger_event();
return blocked;
}
@@ -572,6 +629,7 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
schedule_work(&rfkill->uevent_work);
rfkill_led_trigger_event(rfkill);
+ rfkill_any_led_trigger_event();
}
}
EXPORT_SYMBOL(rfkill_set_states);
@@ -988,6 +1046,7 @@ int __must_check rfkill_register(struct rfkill *rfkill)
#endif
}
+ rfkill_any_led_trigger_event();
rfkill_send_events(rfkill, RFKILL_OP_ADD);
mutex_unlock(&rfkill_global_mutex);
@@ -1020,6 +1079,7 @@ void rfkill_unregister(struct rfkill *rfkill)
mutex_lock(&rfkill_global_mutex);
rfkill_send_events(rfkill, RFKILL_OP_DEL);
list_del_init(&rfkill->node);
+ rfkill_any_led_trigger_event();
mutex_unlock(&rfkill_global_mutex);
rfkill_led_trigger_unregister(rfkill);
@@ -1266,24 +1326,33 @@ static int __init rfkill_init(void)
error = class_register(&rfkill_class);
if (error)
- goto out;
+ goto error_class;
error = misc_register(&rfkill_miscdev);
- if (error) {
- class_unregister(&rfkill_class);
- goto out;
- }
+ if (error)
+ goto error_misc;
+
+ error = rfkill_any_led_trigger_register();
+ if (error)
+ goto error_led_trigger;
#ifdef CONFIG_RFKILL_INPUT
error = rfkill_handler_init();
- if (error) {
- misc_deregister(&rfkill_miscdev);
- class_unregister(&rfkill_class);
- goto out;
- }
+ if (error)
+ goto error_input;
#endif
- out:
+ return 0;
+
+#ifdef CONFIG_RFKILL_INPUT
+error_input:
+ rfkill_any_led_trigger_unregister();
+#endif
+error_led_trigger:
+ misc_deregister(&rfkill_miscdev);
+error_misc:
+ class_unregister(&rfkill_class);
+error_class:
return error;
}
subsys_initcall(rfkill_init);
@@ -1293,6 +1362,7 @@ static void __exit rfkill_exit(void)
#ifdef CONFIG_RFKILL_INPUT
rfkill_handler_exit();
#endif
+ rfkill_any_led_trigger_unregister();
misc_deregister(&rfkill_miscdev);
class_unregister(&rfkill_class);
}
diff --git a/net/rfkill/rfkill-regulator.c b/net/rfkill/rfkill-regulator.c
deleted file mode 100644
index 50cd26a48e87..000000000000
--- a/net/rfkill/rfkill-regulator.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * rfkill-regulator.c - Regulator consumer driver for rfkill
- *
- * Copyright (C) 2009 Guiming Zhuo <gmzhuo@gmail.com>
- * Copyright (C) 2011 Antonio Ospite <ospite@studenti.unina.it>
- *
- * Implementation inspired by leds-regulator driver.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/module.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/platform_device.h>
-#include <linux/regulator/consumer.h>
-#include <linux/rfkill.h>
-#include <linux/rfkill-regulator.h>
-
-struct rfkill_regulator_data {
- struct rfkill *rf_kill;
- bool reg_enabled;
-
- struct regulator *vcc;
-};
-
-static int rfkill_regulator_set_block(void *data, bool blocked)
-{
- struct rfkill_regulator_data *rfkill_data = data;
- int ret = 0;
-
- pr_debug("%s: blocked: %d\n", __func__, blocked);
-
- if (blocked) {
- if (rfkill_data->reg_enabled) {
- regulator_disable(rfkill_data->vcc);
- rfkill_data->reg_enabled = false;
- }
- } else {
- if (!rfkill_data->reg_enabled) {
- ret = regulator_enable(rfkill_data->vcc);
- if (!ret)
- rfkill_data->reg_enabled = true;
- }
- }
-
- pr_debug("%s: regulator_is_enabled after set_block: %d\n", __func__,
- regulator_is_enabled(rfkill_data->vcc));
-
- return ret;
-}
-
-static struct rfkill_ops rfkill_regulator_ops = {
- .set_block = rfkill_regulator_set_block,
-};
-
-static int rfkill_regulator_probe(struct platform_device *pdev)
-{
- struct rfkill_regulator_platform_data *pdata = pdev->dev.platform_data;
- struct rfkill_regulator_data *rfkill_data;
- struct regulator *vcc;
- struct rfkill *rf_kill;
- int ret = 0;
-
- if (pdata == NULL) {
- dev_err(&pdev->dev, "no platform data\n");
- return -ENODEV;
- }
-
- if (pdata->name == NULL || pdata->type == 0) {
- dev_err(&pdev->dev, "invalid name or type in platform data\n");
- return -EINVAL;
- }
-
- vcc = regulator_get_exclusive(&pdev->dev, "vrfkill");
- if (IS_ERR(vcc)) {
- dev_err(&pdev->dev, "Cannot get vcc for %s\n", pdata->name);
- ret = PTR_ERR(vcc);
- goto out;
- }
-
- rfkill_data = kzalloc(sizeof(*rfkill_data), GFP_KERNEL);
- if (rfkill_data == NULL) {
- ret = -ENOMEM;
- goto err_data_alloc;
- }
-
- rf_kill = rfkill_alloc(pdata->name, &pdev->dev,
- pdata->type,
- &rfkill_regulator_ops, rfkill_data);
- if (rf_kill == NULL) {
- ret = -ENOMEM;
- goto err_rfkill_alloc;
- }
-
- if (regulator_is_enabled(vcc)) {
- dev_dbg(&pdev->dev, "Regulator already enabled\n");
- rfkill_data->reg_enabled = true;
- }
- rfkill_data->vcc = vcc;
- rfkill_data->rf_kill = rf_kill;
-
- ret = rfkill_register(rf_kill);
- if (ret) {
- dev_err(&pdev->dev, "Cannot register rfkill device\n");
- goto err_rfkill_register;
- }
-
- platform_set_drvdata(pdev, rfkill_data);
- dev_info(&pdev->dev, "%s initialized\n", pdata->name);
-
- return 0;
-
-err_rfkill_register:
- rfkill_destroy(rf_kill);
-err_rfkill_alloc:
- kfree(rfkill_data);
-err_data_alloc:
- regulator_put(vcc);
-out:
- return ret;
-}
-
-static int rfkill_regulator_remove(struct platform_device *pdev)
-{
- struct rfkill_regulator_data *rfkill_data = platform_get_drvdata(pdev);
- struct rfkill *rf_kill = rfkill_data->rf_kill;
-
- rfkill_unregister(rf_kill);
- rfkill_destroy(rf_kill);
- regulator_put(rfkill_data->vcc);
- kfree(rfkill_data);
-
- return 0;
-}
-
-static struct platform_driver rfkill_regulator_driver = {
- .probe = rfkill_regulator_probe,
- .remove = rfkill_regulator_remove,
- .driver = {
- .name = "rfkill-regulator",
- },
-};
-
-module_platform_driver(rfkill_regulator_driver);
-
-MODULE_AUTHOR("Guiming Zhuo <gmzhuo@gmail.com>");
-MODULE_AUTHOR("Antonio Ospite <ospite@studenti.unina.it>");
-MODULE_DESCRIPTION("Regulator consumer driver for rfkill");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:rfkill-regulator");
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 129d357d2722..4a9729257023 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -20,7 +20,7 @@
#include <linux/in.h>
#include <linux/slab.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/string.h>
@@ -34,7 +34,7 @@
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/mm.h>
@@ -871,7 +871,8 @@ out_release:
return err;
}
-static int rose_accept(struct socket *sock, struct socket *newsock, int flags)
+static int rose_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern)
{
struct sk_buff *skb;
struct sock *newsk;
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index 0fc76d845103..452bbb38d943 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -25,7 +25,7 @@
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/termios.h> /* For TIOCINQ/OUTQ */
#include <linux/mm.h>
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
index 8fc6ea347182..b9da4d6b914f 100644
--- a/net/rxrpc/Makefile
+++ b/net/rxrpc/Makefile
@@ -2,7 +2,9 @@
# Makefile for Linux kernel RxRPC
#
-af-rxrpc-y := \
+obj-$(CONFIG_AF_RXRPC) += rxrpc.o
+
+rxrpc-y := \
af_rxrpc.o \
call_accept.o \
call_event.o \
@@ -26,8 +28,6 @@ af-rxrpc-y := \
skbuff.o \
utils.o
-af-rxrpc-$(CONFIG_PROC_FS) += proc.o
-af-rxrpc-$(CONFIG_RXKAD) += rxkad.o
-af-rxrpc-$(CONFIG_SYSCTL) += sysctl.o
-
-obj-$(CONFIG_AF_RXRPC) += af-rxrpc.o
+rxrpc-$(CONFIG_PROC_FS) += proc.o
+rxrpc-$(CONFIG_RXKAD) += rxkad.o
+rxrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 2d59c9be40e1..7fb59c3f1542 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -224,6 +224,14 @@ static int rxrpc_listen(struct socket *sock, int backlog)
else
sk->sk_max_ack_backlog = old;
break;
+ case RXRPC_SERVER_LISTENING:
+ if (backlog == 0) {
+ rx->sk.sk_state = RXRPC_SERVER_LISTEN_DISABLED;
+ sk->sk_max_ack_backlog = 0;
+ rxrpc_discard_prealloc(rx);
+ ret = 0;
+ break;
+ }
default:
ret = -EBUSY;
break;
@@ -282,10 +290,11 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
cp.exclusive = false;
cp.service_id = srx->srx_service;
call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp);
+ /* The socket has been unlocked. */
if (!IS_ERR(call))
call->notify_rx = notify_rx;
- release_sock(&rx->sk);
+ mutex_unlock(&call->user_mutex);
_leave(" = %p", call);
return call;
}
@@ -302,7 +311,10 @@ EXPORT_SYMBOL(rxrpc_kernel_begin_call);
void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call)
{
_enter("%d{%d}", call->debug_id, atomic_read(&call->usage));
+
+ mutex_lock(&call->user_mutex);
rxrpc_release_call(rxrpc_sk(sock->sk), call);
+ mutex_unlock(&call->user_mutex);
rxrpc_put_call(call, rxrpc_call_put_kernel);
}
EXPORT_SYMBOL(rxrpc_kernel_end_call);
@@ -442,14 +454,16 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
case RXRPC_SERVER_BOUND:
case RXRPC_SERVER_LISTENING:
ret = rxrpc_do_sendmsg(rx, m, len);
- break;
+ /* The socket has been unlocked */
+ goto out;
default:
ret = -EINVAL;
- break;
+ goto error_unlock;
}
error_unlock:
release_sock(&rx->sk);
+out:
_leave(" = %d", ret);
return ret;
}
@@ -762,16 +776,17 @@ static const struct net_proto_family rxrpc_family_ops = {
static int __init af_rxrpc_init(void)
{
int ret = -1;
+ unsigned int tmp;
BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb));
get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch));
rxrpc_epoch |= RXRPC_RANDOM_EPOCH;
- get_random_bytes(&rxrpc_client_conn_ids.cur,
- sizeof(rxrpc_client_conn_ids.cur));
- rxrpc_client_conn_ids.cur &= 0x3fffffff;
- if (rxrpc_client_conn_ids.cur == 0)
- rxrpc_client_conn_ids.cur = 1;
+ get_random_bytes(&tmp, sizeof(tmp));
+ tmp &= 0x3fffffff;
+ if (tmp == 0)
+ tmp = 1;
+ idr_set_cursor(&rxrpc_client_conn_ids, tmp);
ret = -ENOMEM;
rxrpc_call_jar = kmem_cache_create(
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index f60e35576526..26a7b1db1361 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -60,6 +60,7 @@ enum {
RXRPC_CLIENT_BOUND, /* client local address bound */
RXRPC_SERVER_BOUND, /* server local address bound */
RXRPC_SERVER_LISTENING, /* server listening for connections */
+ RXRPC_SERVER_LISTEN_DISABLED, /* server listening disabled */
RXRPC_CLOSE, /* socket is being closed */
};
@@ -466,6 +467,7 @@ struct rxrpc_call {
struct rxrpc_connection *conn; /* connection carrying call */
struct rxrpc_peer *peer; /* Peer record for remote address */
struct rxrpc_sock __rcu *socket; /* socket responsible */
+ struct mutex user_mutex; /* User access mutex */
ktime_t ack_at; /* When deferred ACK needs to happen */
ktime_t resend_at; /* When next resend needs to happen */
ktime_t ping_at; /* When next to send a ping */
@@ -593,200 +595,6 @@ struct rxrpc_ack_summary {
u8 cumulative_acks;
};
-enum rxrpc_skb_trace {
- rxrpc_skb_rx_cleaned,
- rxrpc_skb_rx_freed,
- rxrpc_skb_rx_got,
- rxrpc_skb_rx_lost,
- rxrpc_skb_rx_received,
- rxrpc_skb_rx_rotated,
- rxrpc_skb_rx_purged,
- rxrpc_skb_rx_seen,
- rxrpc_skb_tx_cleaned,
- rxrpc_skb_tx_freed,
- rxrpc_skb_tx_got,
- rxrpc_skb_tx_new,
- rxrpc_skb_tx_rotated,
- rxrpc_skb_tx_seen,
- rxrpc_skb__nr_trace
-};
-
-extern const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7];
-
-enum rxrpc_conn_trace {
- rxrpc_conn_new_client,
- rxrpc_conn_new_service,
- rxrpc_conn_queued,
- rxrpc_conn_seen,
- rxrpc_conn_got,
- rxrpc_conn_put_client,
- rxrpc_conn_put_service,
- rxrpc_conn__nr_trace
-};
-
-extern const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4];
-
-enum rxrpc_client_trace {
- rxrpc_client_activate_chans,
- rxrpc_client_alloc,
- rxrpc_client_chan_activate,
- rxrpc_client_chan_disconnect,
- rxrpc_client_chan_pass,
- rxrpc_client_chan_unstarted,
- rxrpc_client_cleanup,
- rxrpc_client_count,
- rxrpc_client_discard,
- rxrpc_client_duplicate,
- rxrpc_client_exposed,
- rxrpc_client_replace,
- rxrpc_client_to_active,
- rxrpc_client_to_culled,
- rxrpc_client_to_idle,
- rxrpc_client_to_inactive,
- rxrpc_client_to_waiting,
- rxrpc_client_uncount,
- rxrpc_client__nr_trace
-};
-
-extern const char rxrpc_client_traces[rxrpc_client__nr_trace][7];
-extern const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5];
-
-enum rxrpc_call_trace {
- rxrpc_call_new_client,
- rxrpc_call_new_service,
- rxrpc_call_queued,
- rxrpc_call_queued_ref,
- rxrpc_call_seen,
- rxrpc_call_connected,
- rxrpc_call_release,
- rxrpc_call_got,
- rxrpc_call_got_userid,
- rxrpc_call_got_kernel,
- rxrpc_call_put,
- rxrpc_call_put_userid,
- rxrpc_call_put_kernel,
- rxrpc_call_put_noqueue,
- rxrpc_call_error,
- rxrpc_call__nr_trace
-};
-
-extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4];
-
-enum rxrpc_transmit_trace {
- rxrpc_transmit_wait,
- rxrpc_transmit_queue,
- rxrpc_transmit_queue_last,
- rxrpc_transmit_rotate,
- rxrpc_transmit_rotate_last,
- rxrpc_transmit_await_reply,
- rxrpc_transmit_end,
- rxrpc_transmit__nr_trace
-};
-
-extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4];
-
-enum rxrpc_receive_trace {
- rxrpc_receive_incoming,
- rxrpc_receive_queue,
- rxrpc_receive_queue_last,
- rxrpc_receive_front,
- rxrpc_receive_rotate,
- rxrpc_receive_end,
- rxrpc_receive__nr_trace
-};
-
-extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4];
-
-enum rxrpc_recvmsg_trace {
- rxrpc_recvmsg_enter,
- rxrpc_recvmsg_wait,
- rxrpc_recvmsg_dequeue,
- rxrpc_recvmsg_hole,
- rxrpc_recvmsg_next,
- rxrpc_recvmsg_cont,
- rxrpc_recvmsg_full,
- rxrpc_recvmsg_data_return,
- rxrpc_recvmsg_terminal,
- rxrpc_recvmsg_to_be_accepted,
- rxrpc_recvmsg_return,
- rxrpc_recvmsg__nr_trace
-};
-
-extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5];
-
-enum rxrpc_rtt_tx_trace {
- rxrpc_rtt_tx_ping,
- rxrpc_rtt_tx_data,
- rxrpc_rtt_tx__nr_trace
-};
-
-extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5];
-
-enum rxrpc_rtt_rx_trace {
- rxrpc_rtt_rx_ping_response,
- rxrpc_rtt_rx_requested_ack,
- rxrpc_rtt_rx__nr_trace
-};
-
-extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5];
-
-enum rxrpc_timer_trace {
- rxrpc_timer_begin,
- rxrpc_timer_init_for_reply,
- rxrpc_timer_init_for_send_reply,
- rxrpc_timer_expired,
- rxrpc_timer_set_for_ack,
- rxrpc_timer_set_for_ping,
- rxrpc_timer_set_for_resend,
- rxrpc_timer_set_for_send,
- rxrpc_timer__nr_trace
-};
-
-extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8];
-
-enum rxrpc_propose_ack_trace {
- rxrpc_propose_ack_client_tx_end,
- rxrpc_propose_ack_input_data,
- rxrpc_propose_ack_ping_for_lost_ack,
- rxrpc_propose_ack_ping_for_lost_reply,
- rxrpc_propose_ack_ping_for_params,
- rxrpc_propose_ack_processing_op,
- rxrpc_propose_ack_respond_to_ack,
- rxrpc_propose_ack_respond_to_ping,
- rxrpc_propose_ack_retry_tx,
- rxrpc_propose_ack_rotate_rx,
- rxrpc_propose_ack_terminal_ack,
- rxrpc_propose_ack__nr_trace
-};
-
-enum rxrpc_propose_ack_outcome {
- rxrpc_propose_ack_use,
- rxrpc_propose_ack_update,
- rxrpc_propose_ack_subsume,
- rxrpc_propose_ack__nr_outcomes
-};
-
-extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8];
-extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes];
-
-enum rxrpc_congest_change {
- rxrpc_cong_begin_retransmission,
- rxrpc_cong_cleared_nacks,
- rxrpc_cong_new_low_nack,
- rxrpc_cong_no_change,
- rxrpc_cong_progress,
- rxrpc_cong_retransmit_again,
- rxrpc_cong_rtt_window_end,
- rxrpc_cong_saw_nack,
- rxrpc_congest__nr_change
-};
-
-extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10];
-extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9];
-
-extern const char *const rxrpc_pkts[];
-extern const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4];
-
#include <trace/events/rxrpc.h>
/*
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 832d854c2d5c..0ed181f53f32 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -323,6 +323,8 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
*
* If we want to report an error, we mark the skb with the packet type and
* abort code and return NULL.
+ *
+ * The call is returned with the user access mutex held.
*/
struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
struct rxrpc_connection *conn,
@@ -349,7 +351,8 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
found_service:
spin_lock(&rx->incoming_lock);
- if (rx->sk.sk_state == RXRPC_CLOSE) {
+ if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED ||
+ rx->sk.sk_state == RXRPC_CLOSE) {
trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber,
sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN);
skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
@@ -370,6 +373,18 @@ found_service:
trace_rxrpc_receive(call, rxrpc_receive_incoming,
sp->hdr.serial, sp->hdr.seq);
+ /* Lock the call to prevent rxrpc_kernel_send/recv_data() and
+ * sendmsg()/recvmsg() inconveniently stealing the mutex once the
+ * notification is generated.
+ *
+ * The BUG should never happen because the kernel should be well
+ * behaved enough not to access the call before the first notification
+ * event and userspace is prevented from doing so until the state is
+ * appropriate.
+ */
+ if (!mutex_trylock(&call->user_mutex))
+ BUG();
+
/* Make the call live. */
rxrpc_incoming_call(rx, call, skb);
conn = call->conn;
@@ -428,10 +443,12 @@ out:
/*
* handle acceptance of a call by userspace
* - assign the user call ID to the call at the front of the queue
+ * - called with the socket locked.
*/
struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
unsigned long user_call_ID,
rxrpc_notify_rx_t notify_rx)
+ __releases(&rx->sk.sk_lock.slock)
{
struct rxrpc_call *call;
struct rb_node *parent, **pp;
@@ -445,6 +462,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
if (list_empty(&rx->to_be_accepted)) {
write_unlock(&rx->call_lock);
+ release_sock(&rx->sk);
kleave(" = -ENODATA [empty]");
return ERR_PTR(-ENODATA);
}
@@ -469,10 +487,39 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
*/
call = list_entry(rx->to_be_accepted.next,
struct rxrpc_call, accept_link);
+ write_unlock(&rx->call_lock);
+
+ /* We need to gain the mutex from the interrupt handler without
+ * upsetting lockdep, so we have to release it there and take it here.
+ * We are, however, still holding the socket lock, so other accepts
+ * must wait for us and no one can add the user ID behind our backs.
+ */
+ if (mutex_lock_interruptible(&call->user_mutex) < 0) {
+ release_sock(&rx->sk);
+ kleave(" = -ERESTARTSYS");
+ return ERR_PTR(-ERESTARTSYS);
+ }
+
+ write_lock(&rx->call_lock);
list_del_init(&call->accept_link);
sk_acceptq_removed(&rx->sk);
rxrpc_see_call(call);
+ /* Find the user ID insertion point. */
+ pp = &rx->calls.rb_node;
+ parent = NULL;
+ while (*pp) {
+ parent = *pp;
+ call = rb_entry(parent, struct rxrpc_call, sock_node);
+
+ if (user_call_ID < call->user_call_ID)
+ pp = &(*pp)->rb_left;
+ else if (user_call_ID > call->user_call_ID)
+ pp = &(*pp)->rb_right;
+ else
+ BUG();
+ }
+
write_lock_bh(&call->state_lock);
switch (call->state) {
case RXRPC_CALL_SERVER_ACCEPTING:
@@ -498,6 +545,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
write_unlock(&rx->call_lock);
rxrpc_notify_socket(call);
rxrpc_service_prealloc(rx, GFP_KERNEL);
+ release_sock(&rx->sk);
_leave(" = %p{%d}", call, call->debug_id);
return call;
@@ -514,6 +562,7 @@ id_in_use:
write_unlock(&rx->call_lock);
out:
rxrpc_service_prealloc(rx, GFP_KERNEL);
+ release_sock(&rx->sk);
_leave(" = %d", ret);
return ERR_PTR(ret);
}
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 1ed18d8c9c9f..d79cd36987a9 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -43,24 +43,6 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = {
[RXRPC_CALL_NETWORK_ERROR] = "NetError",
};
-const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = {
- [rxrpc_call_new_client] = "NWc",
- [rxrpc_call_new_service] = "NWs",
- [rxrpc_call_queued] = "QUE",
- [rxrpc_call_queued_ref] = "QUR",
- [rxrpc_call_connected] = "CON",
- [rxrpc_call_release] = "RLS",
- [rxrpc_call_seen] = "SEE",
- [rxrpc_call_got] = "GOT",
- [rxrpc_call_got_userid] = "Gus",
- [rxrpc_call_got_kernel] = "Gke",
- [rxrpc_call_put] = "PUT",
- [rxrpc_call_put_userid] = "Pus",
- [rxrpc_call_put_kernel] = "Pke",
- [rxrpc_call_put_noqueue] = "PNQ",
- [rxrpc_call_error] = "*E*",
-};
-
struct kmem_cache *rxrpc_call_jar;
LIST_HEAD(rxrpc_calls);
DEFINE_RWLOCK(rxrpc_call_lock);
@@ -133,6 +115,7 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
if (!call->rxtx_annotations)
goto nomem_2;
+ mutex_init(&call->user_mutex);
setup_timer(&call->timer, rxrpc_call_timer_expired,
(unsigned long)call);
INIT_WORK(&call->processor, &rxrpc_process_call);
@@ -212,14 +195,16 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call)
}
/*
- * set up a call for the given data
- * - called in process context with IRQs enabled
+ * Set up a call for the given parameters.
+ * - Called with the socket lock held, which it must release.
+ * - If it returns a call, the call's lock will need releasing by the caller.
*/
struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
struct rxrpc_conn_parameters *cp,
struct sockaddr_rxrpc *srx,
unsigned long user_call_ID,
gfp_t gfp)
+ __releases(&rx->sk.sk_lock.slock)
{
struct rxrpc_call *call, *xcall;
struct rb_node *parent, **pp;
@@ -230,6 +215,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
call = rxrpc_alloc_client_call(srx, gfp);
if (IS_ERR(call)) {
+ release_sock(&rx->sk);
_leave(" = %ld", PTR_ERR(call));
return call;
}
@@ -237,6 +223,11 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage),
here, (const void *)user_call_ID);
+ /* We need to protect a partially set up call against the user as we
+ * will be acting outside the socket lock.
+ */
+ mutex_lock(&call->user_mutex);
+
/* Publish the call, even though it is incompletely set up as yet */
write_lock(&rx->call_lock);
@@ -268,6 +259,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
list_add_tail(&call->link, &rxrpc_calls);
write_unlock(&rxrpc_call_lock);
+ /* From this point on, the call is protected by its own lock. */
+ release_sock(&rx->sk);
+
/* Set up or get a connection record and set the protocol parameters,
* including channel number and call ID.
*/
@@ -297,6 +291,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
*/
error_dup_user_ID:
write_unlock(&rx->call_lock);
+ release_sock(&rx->sk);
ret = -EEXIST;
error:
@@ -305,6 +300,7 @@ error:
trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage),
here, ERR_PTR(ret));
rxrpc_release_call(rx, call);
+ mutex_unlock(&call->user_mutex);
rxrpc_put_call(call, rxrpc_call_put);
_leave(" = %d", ret);
return ERR_PTR(ret);
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 60ef9605167e..c3be03e8d098 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -76,6 +76,8 @@
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/timer.h>
+#include <linux/sched/signal.h>
+
#include "ar-internal.h"
__read_mostly unsigned int rxrpc_max_client_connections = 1000;
@@ -105,14 +107,6 @@ static void rxrpc_discard_expired_client_conns(struct work_struct *);
static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap,
rxrpc_discard_expired_client_conns);
-const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5] = {
- [RXRPC_CONN_CLIENT_INACTIVE] = "Inac",
- [RXRPC_CONN_CLIENT_WAITING] = "Wait",
- [RXRPC_CONN_CLIENT_ACTIVE] = "Actv",
- [RXRPC_CONN_CLIENT_CULLED] = "Cull",
- [RXRPC_CONN_CLIENT_IDLE] = "Idle",
-};
-
/*
* Get a connection ID and epoch for a client connection from the global pool.
* The connection struct pointer is then recorded in the idr radix tree. The
@@ -263,12 +257,12 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn)
* times the maximum number of client conns away from the current
* allocation point to try and keep the IDs concentrated.
*/
- id_cursor = READ_ONCE(rxrpc_client_conn_ids.cur);
+ id_cursor = idr_get_cursor(&rxrpc_client_conn_ids);
id = conn->proto.cid >> RXRPC_CIDSHIFT;
distance = id - id_cursor;
if (distance < 0)
distance = -distance;
- limit = round_up(rxrpc_max_client_connections, IDR_SIZE) * 4;
+ limit = max(rxrpc_max_client_connections * 4, 1024U);
if (distance > limit)
goto mark_dont_reuse;
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 3f9d8d7ec632..b099b64366f3 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -275,6 +275,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
rxrpc_conn_retransmit_call(conn, skb);
return 0;
+ case RXRPC_PACKET_TYPE_BUSY:
+ /* Just ignore BUSY packets for now. */
+ return 0;
+
case RXRPC_PACKET_TYPE_ABORT:
if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
&wtmp, sizeof(wtmp)) < 0)
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index e1e83af47866..b0ecb770fdce 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -173,6 +173,7 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn,
/* Save the result of the call so that we can repeat it if necessary
* through the channel, whilst disposing of the actual call record.
*/
+ trace_rxrpc_disconnect_call(call);
chan->last_service_id = call->service_id;
if (call->abort_code) {
chan->last_abort = call->abort_code;
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 44fb8d893c7d..18b2ad8be8e2 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -420,6 +420,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb,
u16 skew)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ enum rxrpc_call_state state;
unsigned int offset = sizeof(struct rxrpc_wire_header);
unsigned int ix;
rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0;
@@ -434,14 +435,15 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb,
_proto("Rx DATA %%%u { #%u f=%02x }",
sp->hdr.serial, seq, sp->hdr.flags);
- if (call->state >= RXRPC_CALL_COMPLETE)
+ state = READ_ONCE(call->state);
+ if (state >= RXRPC_CALL_COMPLETE)
return;
/* Received data implicitly ACKs all of the request packets we sent
* when we're acting as a client.
*/
- if ((call->state == RXRPC_CALL_CLIENT_SEND_REQUEST ||
- call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) &&
+ if ((state == RXRPC_CALL_CLIENT_SEND_REQUEST ||
+ state == RXRPC_CALL_CLIENT_AWAIT_REPLY) &&
!rxrpc_receiving_reply(call))
return;
@@ -481,6 +483,7 @@ next_subpacket:
return rxrpc_proto_abort("LSA", call, seq);
}
+ trace_rxrpc_rx_data(call, seq, serial, flags, annotation);
if (before_eq(seq, hard_ack)) {
ack = RXRPC_ACK_DUPLICATE;
ack_serial = serial;
@@ -649,6 +652,7 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxrpc_peer *peer;
unsigned int mtu;
+ bool wake = false;
u32 rwind = ntohl(ackinfo->rwind);
_proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }",
@@ -656,9 +660,14 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU),
rwind, ntohl(ackinfo->jumbo_max));
- if (rwind > RXRPC_RXTX_BUFF_SIZE - 1)
- rwind = RXRPC_RXTX_BUFF_SIZE - 1;
- call->tx_winsize = rwind;
+ if (call->tx_winsize != rwind) {
+ if (rwind > RXRPC_RXTX_BUFF_SIZE - 1)
+ rwind = RXRPC_RXTX_BUFF_SIZE - 1;
+ if (rwind > call->tx_winsize)
+ wake = true;
+ call->tx_winsize = rwind;
+ }
+
if (call->cong_ssthresh > rwind)
call->cong_ssthresh = rwind;
@@ -672,6 +681,9 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
spin_unlock_bh(&peer->lock);
_net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata);
}
+
+ if (wake)
+ wake_up(&call->waitq);
}
/*
@@ -765,16 +777,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ?
buf.ack.reason : RXRPC_ACK__INVALID);
- trace_rxrpc_rx_ack(call, first_soft_ack, summary.ack_reason, nr_acks);
-
- _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
- sp->hdr.serial,
- ntohs(buf.ack.maxSkew),
- first_soft_ack,
- ntohl(buf.ack.previousPacket),
- acked_serial,
- rxrpc_ack_names[summary.ack_reason],
- buf.ack.nAcks);
+ trace_rxrpc_rx_ack(call, sp->hdr.serial, acked_serial,
+ first_soft_ack, ntohl(buf.ack.previousPacket),
+ summary.ack_reason, nr_acks);
if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE)
rxrpc_input_ping_response(call, skb->tstamp, acked_serial,
@@ -805,7 +810,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
return rxrpc_proto_abort("AK0", call, 0);
/* Ignore ACKs unless we are or have just been transmitting. */
- switch (call->state) {
+ switch (READ_ONCE(call->state)) {
case RXRPC_CALL_CLIENT_SEND_REQUEST:
case RXRPC_CALL_CLIENT_AWAIT_REPLY:
case RXRPC_CALL_SERVER_SEND_REPLY:
@@ -931,7 +936,6 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
break;
default:
- _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial);
break;
}
@@ -947,7 +951,7 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn,
struct rxrpc_call *call)
{
- switch (call->state) {
+ switch (READ_ONCE(call->state)) {
case RXRPC_CALL_SERVER_AWAIT_ACK:
rxrpc_call_completed(call);
break;
@@ -961,6 +965,7 @@ static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn,
break;
}
+ trace_rxrpc_improper_term(call);
__rxrpc_disconnect_call(conn, call);
rxrpc_notify_socket(call);
}
@@ -1053,7 +1058,7 @@ void rxrpc_data_ready(struct sock *udp_sk)
ASSERT(!irqs_disabled());
- skb = skb_recv_datagram(udp_sk, 0, 1, &ret);
+ skb = skb_recv_udp(udp_sk, 0, 1, &ret);
if (!skb) {
if (ret == -EAGAIN)
return;
@@ -1075,10 +1080,9 @@ void rxrpc_data_ready(struct sock *udp_sk)
__UDP_INC_STATS(&init_net, UDP_MIB_INDATAGRAMS, 0);
- /* The socket buffer we have is owned by UDP, with UDP's data all over
- * it, but we really want our own data there.
+ /* The UDP protocol already released all skb resources;
+ * we are free to add our own data there.
*/
- skb_orphan(skb);
sp = rxrpc_skb(skb);
/* dig out the RxRPC connection details */
@@ -1201,6 +1205,7 @@ void rxrpc_data_ready(struct sock *udp_sk)
goto reject_packet;
}
rxrpc_send_ping(call, skb, skew);
+ mutex_unlock(&call->user_mutex);
}
rxrpc_input_call_packet(call, skb, skew);
diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c
index 18c737a61d80..0a4e28477ad9 100644
--- a/net/rxrpc/key.c
+++ b/net/rxrpc/key.c
@@ -1065,7 +1065,7 @@ static long rxrpc_read(const struct key *key,
switch (token->security_index) {
case RXRPC_SECURITY_RXKAD:
- toksize += 8 * 4; /* viceid, kvno, key*2, begin,
+ toksize += 9 * 4; /* viceid, kvno, key*2 + len, begin,
* end, primary, tktlen */
toksize += RND(token->kad->ticket_len);
break;
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index 6dee55fad2d3..1a2d4b112064 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -77,12 +77,6 @@ unsigned int rxrpc_rx_jumbo_max = 4;
*/
unsigned int rxrpc_resend_timeout = 4 * 1000;
-const char *const rxrpc_pkts[] = {
- "?00",
- "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG",
- "?09", "?10", "?11", "?12", "VERSION", "?14", "?15"
-};
-
const s8 rxrpc_ack_priority[] = {
[0] = 0,
[RXRPC_ACK_DELAY] = 1,
@@ -94,148 +88,3 @@ const s8 rxrpc_ack_priority[] = {
[RXRPC_ACK_NOSPACE] = 7,
[RXRPC_ACK_PING_RESPONSE] = 8,
};
-
-const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = {
- "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
- "IDL", "-?-"
-};
-
-const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = {
- [rxrpc_skb_rx_cleaned] = "Rx CLN",
- [rxrpc_skb_rx_freed] = "Rx FRE",
- [rxrpc_skb_rx_got] = "Rx GOT",
- [rxrpc_skb_rx_lost] = "Rx *L*",
- [rxrpc_skb_rx_received] = "Rx RCV",
- [rxrpc_skb_rx_purged] = "Rx PUR",
- [rxrpc_skb_rx_rotated] = "Rx ROT",
- [rxrpc_skb_rx_seen] = "Rx SEE",
- [rxrpc_skb_tx_cleaned] = "Tx CLN",
- [rxrpc_skb_tx_freed] = "Tx FRE",
- [rxrpc_skb_tx_got] = "Tx GOT",
- [rxrpc_skb_tx_new] = "Tx NEW",
- [rxrpc_skb_tx_rotated] = "Tx ROT",
- [rxrpc_skb_tx_seen] = "Tx SEE",
-};
-
-const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = {
- [rxrpc_conn_new_client] = "NWc",
- [rxrpc_conn_new_service] = "NWs",
- [rxrpc_conn_queued] = "QUE",
- [rxrpc_conn_seen] = "SEE",
- [rxrpc_conn_got] = "GOT",
- [rxrpc_conn_put_client] = "PTc",
- [rxrpc_conn_put_service] = "PTs",
-};
-
-const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = {
- [rxrpc_client_activate_chans] = "Activa",
- [rxrpc_client_alloc] = "Alloc ",
- [rxrpc_client_chan_activate] = "ChActv",
- [rxrpc_client_chan_disconnect] = "ChDisc",
- [rxrpc_client_chan_pass] = "ChPass",
- [rxrpc_client_chan_unstarted] = "ChUnst",
- [rxrpc_client_cleanup] = "Clean ",
- [rxrpc_client_count] = "Count ",
- [rxrpc_client_discard] = "Discar",
- [rxrpc_client_duplicate] = "Duplic",
- [rxrpc_client_exposed] = "Expose",
- [rxrpc_client_replace] = "Replac",
- [rxrpc_client_to_active] = "->Actv",
- [rxrpc_client_to_culled] = "->Cull",
- [rxrpc_client_to_idle] = "->Idle",
- [rxrpc_client_to_inactive] = "->Inac",
- [rxrpc_client_to_waiting] = "->Wait",
- [rxrpc_client_uncount] = "Uncoun",
-};
-
-const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = {
- [rxrpc_transmit_wait] = "WAI",
- [rxrpc_transmit_queue] = "QUE",
- [rxrpc_transmit_queue_last] = "QLS",
- [rxrpc_transmit_rotate] = "ROT",
- [rxrpc_transmit_rotate_last] = "RLS",
- [rxrpc_transmit_await_reply] = "AWR",
- [rxrpc_transmit_end] = "END",
-};
-
-const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = {
- [rxrpc_receive_incoming] = "INC",
- [rxrpc_receive_queue] = "QUE",
- [rxrpc_receive_queue_last] = "QLS",
- [rxrpc_receive_front] = "FRN",
- [rxrpc_receive_rotate] = "ROT",
- [rxrpc_receive_end] = "END",
-};
-
-const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = {
- [rxrpc_recvmsg_enter] = "ENTR",
- [rxrpc_recvmsg_wait] = "WAIT",
- [rxrpc_recvmsg_dequeue] = "DEQU",
- [rxrpc_recvmsg_hole] = "HOLE",
- [rxrpc_recvmsg_next] = "NEXT",
- [rxrpc_recvmsg_cont] = "CONT",
- [rxrpc_recvmsg_full] = "FULL",
- [rxrpc_recvmsg_data_return] = "DATA",
- [rxrpc_recvmsg_terminal] = "TERM",
- [rxrpc_recvmsg_to_be_accepted] = "TBAC",
- [rxrpc_recvmsg_return] = "RETN",
-};
-
-const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = {
- [rxrpc_rtt_tx_ping] = "PING",
- [rxrpc_rtt_tx_data] = "DATA",
-};
-
-const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = {
- [rxrpc_rtt_rx_ping_response] = "PONG",
- [rxrpc_rtt_rx_requested_ack] = "RACK",
-};
-
-const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = {
- [rxrpc_timer_begin] = "Begin ",
- [rxrpc_timer_expired] = "*EXPR*",
- [rxrpc_timer_init_for_reply] = "IniRpl",
- [rxrpc_timer_init_for_send_reply] = "SndRpl",
- [rxrpc_timer_set_for_ack] = "SetAck",
- [rxrpc_timer_set_for_ping] = "SetPng",
- [rxrpc_timer_set_for_send] = "SetTx ",
- [rxrpc_timer_set_for_resend] = "SetRTx",
-};
-
-const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = {
- [rxrpc_propose_ack_client_tx_end] = "ClTxEnd",
- [rxrpc_propose_ack_input_data] = "DataIn ",
- [rxrpc_propose_ack_ping_for_lost_ack] = "LostAck",
- [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl",
- [rxrpc_propose_ack_ping_for_params] = "Params ",
- [rxrpc_propose_ack_processing_op] = "ProcOp ",
- [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack",
- [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png",
- [rxrpc_propose_ack_retry_tx] = "RetryTx",
- [rxrpc_propose_ack_rotate_rx] = "RxAck ",
- [rxrpc_propose_ack_terminal_ack] = "ClTerm ",
-};
-
-const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = {
- [rxrpc_propose_ack_use] = "",
- [rxrpc_propose_ack_update] = " Update",
- [rxrpc_propose_ack_subsume] = " Subsume",
-};
-
-const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10] = {
- [RXRPC_CALL_SLOW_START] = "SlowStart",
- [RXRPC_CALL_CONGEST_AVOIDANCE] = "CongAvoid",
- [RXRPC_CALL_PACKET_LOSS] = "PktLoss ",
- [RXRPC_CALL_FAST_RETRANSMIT] = "FastReTx ",
-};
-
-const char rxrpc_congest_changes[rxrpc_congest__nr_change][9] = {
- [rxrpc_cong_begin_retransmission] = " Retrans",
- [rxrpc_cong_cleared_nacks] = " Cleared",
- [rxrpc_cong_new_low_nack] = " NewLowN",
- [rxrpc_cong_no_change] = "",
- [rxrpc_cong_progress] = " Progres",
- [rxrpc_cong_retransmit_again] = " ReTxAgn",
- [rxrpc_cong_rtt_window_end] = " RttWinE",
- [rxrpc_cong_saw_nack] = " SawNack",
-};
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index 65cd980767fa..b9bcfbfb095c 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -52,6 +52,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
struct rxrpc_sock *rx;
struct rxrpc_peer *peer;
struct rxrpc_call *call;
+ rxrpc_seq_t tx_hard_ack, rx_hard_ack;
char lbuff[50], rbuff[50];
if (v == &rxrpc_calls) {
@@ -82,9 +83,11 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
else
strcpy(rbuff, "no_connection");
+ tx_hard_ack = READ_ONCE(call->tx_hard_ack);
+ rx_hard_ack = READ_ONCE(call->rx_hard_ack);
seq_printf(seq,
"UDP %-47.47s %-47.47s %4x %08x %08x %s %3u"
- " %-8.8s %08x %lx\n",
+ " %-8.8s %08x %lx %08x %02x %08x %02x\n",
lbuff,
rbuff,
call->service_id,
@@ -94,7 +97,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
atomic_read(&call->usage),
rxrpc_call_states[call->state],
call->abort_code,
- call->user_call_ID);
+ call->user_call_ID,
+ tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack,
+ rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack);
return 0;
}
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index c29362d50a92..3e2f1a8e9c5b 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -14,6 +14,8 @@
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/export.h>
+#include <linux/sched/signal.h>
+
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include "ar-internal.h"
@@ -320,8 +322,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
/* Barriers against rxrpc_input_data(). */
hard_ack = call->rx_hard_ack;
- top = smp_load_acquire(&call->rx_top);
- for (seq = hard_ack + 1; before_eq(seq, top); seq++) {
+ seq = hard_ack + 1;
+ while (top = smp_load_acquire(&call->rx_top),
+ before_eq(seq, top)
+ ) {
ix = seq & RXRPC_RXTX_BUFF_MASK;
skb = call->rxtx_buffer[ix];
if (!skb) {
@@ -394,6 +398,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
ret = 1;
goto out;
}
+
+ seq++;
}
out:
@@ -483,6 +489,20 @@ try_again:
trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0);
+ /* We're going to drop the socket lock, so we need to lock the call
+ * against interference by sendmsg.
+ */
+ if (!mutex_trylock(&call->user_mutex)) {
+ ret = -EWOULDBLOCK;
+ if (flags & MSG_DONTWAIT)
+ goto error_requeue_call;
+ ret = -ERESTARTSYS;
+ if (mutex_lock_interruptible(&call->user_mutex) < 0)
+ goto error_requeue_call;
+ }
+
+ release_sock(&rx->sk);
+
if (test_bit(RXRPC_CALL_RELEASED, &call->flags))
BUG();
@@ -498,7 +518,7 @@ try_again:
&call->user_call_ID);
}
if (ret < 0)
- goto error;
+ goto error_unlock_call;
}
if (msg->msg_name) {
@@ -507,7 +527,7 @@ try_again:
msg->msg_namelen = len;
}
- switch (call->state) {
+ switch (READ_ONCE(call->state)) {
case RXRPC_CALL_SERVER_ACCEPTING:
ret = rxrpc_recvmsg_new_call(rx, call, msg, flags);
break;
@@ -529,12 +549,12 @@ try_again:
}
if (ret < 0)
- goto error;
+ goto error_unlock_call;
if (call->state == RXRPC_CALL_COMPLETE) {
ret = rxrpc_recvmsg_term(call, msg);
if (ret < 0)
- goto error;
+ goto error_unlock_call;
if (!(flags & MSG_PEEK))
rxrpc_release_call(rx, call);
msg->msg_flags |= MSG_EOR;
@@ -547,8 +567,21 @@ try_again:
msg->msg_flags &= ~MSG_MORE;
ret = copied;
-error:
+error_unlock_call:
+ mutex_unlock(&call->user_mutex);
rxrpc_put_call(call, rxrpc_call_put);
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret);
+ return ret;
+
+error_requeue_call:
+ if (!(flags & MSG_PEEK)) {
+ write_lock_bh(&rx->recvmsg_lock);
+ list_add(&call->recvmsg_link, &rx->recvmsg_q);
+ write_unlock_bh(&rx->recvmsg_lock);
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_requeue, 0, 0, 0, 0);
+ } else {
+ rxrpc_put_call(call, rxrpc_call_put);
+ }
error_no_call:
release_sock(&rx->sk);
trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret);
@@ -605,9 +638,9 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
iov.iov_len = size - *_offset;
iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset);
- lock_sock(sock->sk);
+ mutex_lock(&call->user_mutex);
- switch (call->state) {
+ switch (READ_ONCE(call->state)) {
case RXRPC_CALL_CLIENT_RECV_REPLY:
case RXRPC_CALL_SERVER_RECV_REQUEST:
case RXRPC_CALL_SERVER_ACK_REQUEST:
@@ -644,7 +677,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
read_phase_complete:
ret = 1;
out:
- release_sock(sock->sk);
+ mutex_unlock(&call->user_mutex);
_leave(" = %d [%zu,%d]", ret, *_offset, *_abort);
return ret;
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index b214a4d4a641..97ab214ca411 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -15,6 +15,8 @@
#include <linux/gfp.h>
#include <linux/skbuff.h>
#include <linux/export.h>
+#include <linux/sched/signal.h>
+
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include "ar-internal.h"
@@ -59,9 +61,12 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
}
trace_rxrpc_transmit(call, rxrpc_transmit_wait);
- release_sock(&rx->sk);
+ mutex_unlock(&call->user_mutex);
*timeo = schedule_timeout(*timeo);
- lock_sock(&rx->sk);
+ if (mutex_lock_interruptible(&call->user_mutex) < 0) {
+ ret = sock_intr_errno(*timeo);
+ break;
+ }
}
remove_wait_queue(&call->waitq, &myself);
@@ -171,7 +176,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
/*
* send data through a socket
* - must be called in process context
- * - caller holds the socket locked
+ * - The caller holds the call user access mutex, but not the socket lock.
*/
static int rxrpc_send_data(struct rxrpc_sock *rx,
struct rxrpc_call *call,
@@ -376,7 +381,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg,
if (!CMSG_OK(msg, cmsg))
return -EINVAL;
- len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+ len = cmsg->cmsg_len - sizeof(struct cmsghdr);
_debug("CMSG %d, %d, %d",
cmsg->cmsg_level, cmsg->cmsg_type, len);
@@ -437,10 +442,13 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg,
/*
* Create a new client call for sendmsg().
+ * - Called with the socket lock held, which it must release.
+ * - If it returns a call, the call's lock will need releasing by the caller.
*/
static struct rxrpc_call *
rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
unsigned long user_call_ID, bool exclusive)
+ __releases(&rx->sk.sk_lock.slock)
{
struct rxrpc_conn_parameters cp;
struct rxrpc_call *call;
@@ -450,8 +458,10 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
_enter("");
- if (!msg->msg_name)
+ if (!msg->msg_name) {
+ release_sock(&rx->sk);
return ERR_PTR(-EDESTADDRREQ);
+ }
key = rx->key;
if (key && !rx->key->payload.data[0])
@@ -464,6 +474,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
cp.exclusive = rx->exclusive | exclusive;
cp.service_id = srx->srx_service;
call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL);
+ /* The socket is now unlocked */
_leave(" = %p\n", call);
return call;
@@ -475,7 +486,9 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
* - the socket may be either a client socket or a server socket
*/
int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
+ __releases(&rx->sk.sk_lock.slock)
{
+ enum rxrpc_call_state state;
enum rxrpc_command cmd;
struct rxrpc_call *call;
unsigned long user_call_ID = 0;
@@ -488,12 +501,14 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code,
&exclusive);
if (ret < 0)
- return ret;
+ goto error_release_sock;
if (cmd == RXRPC_CMD_ACCEPT) {
+ ret = -EINVAL;
if (rx->sk.sk_state != RXRPC_SERVER_LISTENING)
- return -EINVAL;
+ goto error_release_sock;
call = rxrpc_accept_call(rx, user_call_ID, NULL);
+ /* The socket is now unlocked. */
if (IS_ERR(call))
return PTR_ERR(call);
rxrpc_put_call(call, rxrpc_call_put);
@@ -502,18 +517,41 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
call = rxrpc_find_call_by_user_ID(rx, user_call_ID);
if (!call) {
+ ret = -EBADSLT;
if (cmd != RXRPC_CMD_SEND_DATA)
- return -EBADSLT;
+ goto error_release_sock;
call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID,
exclusive);
+ /* The socket is now unlocked... */
if (IS_ERR(call))
return PTR_ERR(call);
+ /* ... and we have the call lock. */
+ } else {
+ switch (READ_ONCE(call->state)) {
+ case RXRPC_CALL_UNINITIALISED:
+ case RXRPC_CALL_CLIENT_AWAIT_CONN:
+ case RXRPC_CALL_SERVER_PREALLOC:
+ case RXRPC_CALL_SERVER_SECURING:
+ case RXRPC_CALL_SERVER_ACCEPTING:
+ ret = -EBUSY;
+ goto error_release_sock;
+ default:
+ break;
+ }
+
+ ret = mutex_lock_interruptible(&call->user_mutex);
+ release_sock(&rx->sk);
+ if (ret < 0) {
+ ret = -ERESTARTSYS;
+ goto error_put;
+ }
}
+ state = READ_ONCE(call->state);
_debug("CALL %d USR %lx ST %d on CONN %p",
- call->debug_id, call->user_call_ID, call->state, call->conn);
+ call->debug_id, call->user_call_ID, state, call->conn);
- if (call->state >= RXRPC_CALL_COMPLETE) {
+ if (state >= RXRPC_CALL_COMPLETE) {
/* it's too late for this call */
ret = -ESHUTDOWN;
} else if (cmd == RXRPC_CMD_SEND_ABORT) {
@@ -523,21 +561,27 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
} else if (cmd != RXRPC_CMD_SEND_DATA) {
ret = -EINVAL;
} else if (rxrpc_is_client_call(call) &&
- call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) {
+ state != RXRPC_CALL_CLIENT_SEND_REQUEST) {
/* request phase complete for this client call */
ret = -EPROTO;
} else if (rxrpc_is_service_call(call) &&
- call->state != RXRPC_CALL_SERVER_ACK_REQUEST &&
- call->state != RXRPC_CALL_SERVER_SEND_REPLY) {
+ state != RXRPC_CALL_SERVER_ACK_REQUEST &&
+ state != RXRPC_CALL_SERVER_SEND_REPLY) {
/* Reply phase not begun or not complete for service call. */
ret = -EPROTO;
} else {
ret = rxrpc_send_data(rx, call, msg, len);
}
+ mutex_unlock(&call->user_mutex);
+error_put:
rxrpc_put_call(call, rxrpc_call_put);
_leave(" = %d", ret);
return ret;
+
+error_release_sock:
+ release_sock(&rx->sk);
+ return ret;
}
/**
@@ -562,22 +606,29 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call,
ASSERTCMP(msg->msg_name, ==, NULL);
ASSERTCMP(msg->msg_control, ==, NULL);
- lock_sock(sock->sk);
+ mutex_lock(&call->user_mutex);
_debug("CALL %d USR %lx ST %d on CONN %p",
call->debug_id, call->user_call_ID, call->state, call->conn);
- if (call->state >= RXRPC_CALL_COMPLETE) {
- ret = -ESHUTDOWN; /* it's too late for this call */
- } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST &&
- call->state != RXRPC_CALL_SERVER_ACK_REQUEST &&
- call->state != RXRPC_CALL_SERVER_SEND_REPLY) {
- ret = -EPROTO; /* request phase complete for this client call */
- } else {
+ switch (READ_ONCE(call->state)) {
+ case RXRPC_CALL_CLIENT_SEND_REQUEST:
+ case RXRPC_CALL_SERVER_ACK_REQUEST:
+ case RXRPC_CALL_SERVER_SEND_REPLY:
ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len);
+ break;
+ case RXRPC_CALL_COMPLETE:
+ read_lock_bh(&call->state_lock);
+ ret = -call->error;
+ read_unlock_bh(&call->state_lock);
+ break;
+ default:
+ /* Request phase complete for this client call */
+ ret = -EPROTO;
+ break;
}
- release_sock(sock->sk);
+ mutex_unlock(&call->user_mutex);
_leave(" = %d", ret);
return ret;
}
@@ -598,12 +649,12 @@ void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call,
{
_enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why);
- lock_sock(sock->sk);
+ mutex_lock(&call->user_mutex);
if (rxrpc_abort_call(why, call, 0, abort_code, error))
rxrpc_send_abort_packet(call);
- release_sock(sock->sk);
+ mutex_unlock(&call->user_mutex);
_leave("");
}
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 87956a768d1b..403790cce7d2 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -650,6 +650,18 @@ config NET_ACT_MIRRED
To compile this code as a module, choose M here: the
module will be called act_mirred.
+config NET_ACT_SAMPLE
+ tristate "Traffic Sampling"
+ depends on NET_CLS_ACT
+ select PSAMPLE
+ ---help---
+ Say Y here to allow packet sampling tc action. The packet sample
+ action consists of statistically choosing packets and sampling
+ them using the psample module.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_sample.
+
config NET_ACT_IPT
tristate "IPtables targets"
depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
@@ -707,6 +719,7 @@ config NET_ACT_SKBEDIT
config NET_ACT_CSUM
tristate "Checksum Updating"
depends on NET_CLS_ACT && INET
+ select LIBCRC32C
---help---
Say Y here to update some common checksum after some direct
packet alterations.
@@ -763,6 +776,7 @@ config NET_ACT_SKBMOD
config NET_ACT_IFE
tristate "Inter-FE action based on IETF ForCES InterFE LFB"
depends on NET_CLS_ACT
+ select NET_IFE
---help---
Say Y here to allow for sourcing and terminating metadata
For details refer to netdev01 paper:
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 4bdda3634e0b..7b915d226de7 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_NET_CLS_ACT) += act_api.o
obj-$(CONFIG_NET_ACT_POLICE) += act_police.o
obj-$(CONFIG_NET_ACT_GACT) += act_gact.o
obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o
+obj-$(CONFIG_NET_ACT_SAMPLE) += act_sample.o
obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o
obj-$(CONFIG_NET_ACT_NAT) += act_nat.o
obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index f893d180da1c..e05b924618a0 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -24,6 +24,7 @@
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/sch_generic.h>
+#include <net/pkt_cls.h>
#include <net/act_api.h>
#include <net/netlink.h>
@@ -33,6 +34,12 @@ static void free_tcf(struct rcu_head *head)
free_percpu(p->cpu_bstats);
free_percpu(p->cpu_qstats);
+
+ if (p->act_cookie) {
+ kfree(p->act_cookie->data);
+ kfree(p->act_cookie);
+ }
+
kfree(p);
}
@@ -41,8 +48,7 @@ static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *p)
spin_lock_bh(&hinfo->lock);
hlist_del(&p->tcfa_head);
spin_unlock_bh(&hinfo->lock);
- gen_kill_estimator(&p->tcfa_bstats,
- &p->tcfa_rate_est);
+ gen_kill_estimator(&p->tcfa_rate_est);
/*
* gen_estimator est_timer() might access p->tcfa_lock
* or bstats, wait a RCU grace period before freeing p
@@ -237,8 +243,7 @@ EXPORT_SYMBOL(tcf_hash_check);
void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
{
if (est)
- gen_kill_estimator(&a->tcfa_bstats,
- &a->tcfa_rate_est);
+ gen_kill_estimator(&a->tcfa_rate_est);
call_rcu(&a->tcfa_rcu, free_tcf);
}
EXPORT_SYMBOL(tcf_hash_cleanup);
@@ -428,11 +433,9 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
{
int ret = -1, i;
- if (skb->tc_verd & TC_NCLS) {
- skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
- ret = TC_ACT_OK;
- goto exec_done;
- }
+ if (skb_skip_tc_classify(skb))
+ return TC_ACT_OK;
+
for (i = 0; i < nr_actions; i++) {
const struct tc_action *a = actions[i];
@@ -441,9 +444,8 @@ repeat:
if (ret == TC_ACT_REPEAT)
goto repeat; /* we need a ttl - JHS */
if (ret != TC_ACT_PIPE)
- goto exec_done;
+ break;
}
-exec_done:
return ret;
}
EXPORT_SYMBOL(tcf_action_exec);
@@ -480,6 +482,12 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
goto nla_put_failure;
if (tcf_action_copy_stats(skb, a, 0))
goto nla_put_failure;
+ if (a->act_cookie) {
+ if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len,
+ a->act_cookie->data))
+ goto nla_put_failure;
+ }
+
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -521,12 +529,29 @@ errout:
return err;
}
+static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
+{
+ struct tc_cookie *c = kzalloc(sizeof(*c), GFP_KERNEL);
+ if (!c)
+ return NULL;
+
+ c->data = nla_memdup(tb[TCA_ACT_COOKIE], GFP_KERNEL);
+ if (!c->data) {
+ kfree(c);
+ return NULL;
+ }
+ c->len = nla_len(tb[TCA_ACT_COOKIE]);
+
+ return c;
+}
+
struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
struct nlattr *est, char *name, int ovr,
int bind)
{
struct tc_action *a;
struct tc_action_ops *a_o;
+ struct tc_cookie *cookie = NULL;
char act_name[IFNAMSIZ];
struct nlattr *tb[TCA_ACT_MAX + 1];
struct nlattr *kind;
@@ -542,6 +567,18 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
goto err_out;
if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ)
goto err_out;
+ if (tb[TCA_ACT_COOKIE]) {
+ int cklen = nla_len(tb[TCA_ACT_COOKIE]);
+
+ if (cklen > TC_COOKIE_MAX_SIZE)
+ goto err_out;
+
+ cookie = nla_memdup_cookie(tb);
+ if (!cookie) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ }
} else {
err = -EINVAL;
if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ)
@@ -580,6 +617,14 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
if (err < 0)
goto err_mod;
+ if (name == NULL && tb[TCA_ACT_COOKIE]) {
+ if (a->act_cookie) {
+ kfree(a->act_cookie->data);
+ kfree(a->act_cookie);
+ }
+ a->act_cookie = cookie;
+ }
+
/* module count goes up only when brand new policy is created
* if it exists and is only bound to in a_o->init() then
* ACT_P_CREATED is not returned (a zero is).
@@ -592,6 +637,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
err_mod:
module_put(a_o->owner);
err_out:
+ if (cookie) {
+ kfree(cookie->data);
+ kfree(cookie);
+ }
return ERR_PTR(err);
}
@@ -670,8 +719,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
goto errout;
if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
- gnet_stats_copy_rate_est(&d, &p->tcfa_bstats,
- &p->tcfa_rate_est) < 0 ||
+ gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
gnet_stats_copy_queue(&d, p->cpu_qstats,
&p->tcfa_qstats,
p->tcfa_qstats.qlen) < 0)
@@ -820,10 +868,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
goto out_module_put;
err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops);
- if (err < 0)
+ if (err <= 0)
goto out_module_put;
- if (err == 0)
- goto noflush_out;
nla_nest_end(skb, nest);
@@ -840,7 +886,6 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
out_module_put:
module_put(ops->owner);
err_out:
-noflush_out:
kfree_skb(skb);
return err;
}
@@ -903,8 +948,6 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
goto err;
}
act->order = i;
- if (event == RTM_GETACTION)
- act->tcfa_refcnt++;
list_add_tail(&act->list, &actions);
}
@@ -917,7 +960,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
return ret;
}
err:
- tcf_action_destroy(&actions, 0);
+ if (event != RTM_GETACTION)
+ tcf_action_destroy(&actions, 0);
return ret;
}
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 1d3960033f61..520baa41cba3 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -28,12 +28,11 @@ struct tcf_bpf_cfg {
struct bpf_prog *filter;
struct sock_filter *bpf_ops;
const char *bpf_name;
- u32 bpf_fd;
u16 bpf_num_ops;
bool is_ebpf;
};
-static int bpf_net_id;
+static unsigned int bpf_net_id;
static struct tc_action_ops act_bpf_ops;
static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
@@ -118,13 +117,18 @@ static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog,
static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog,
struct sk_buff *skb)
{
- if (nla_put_u32(skb, TCA_ACT_BPF_FD, prog->bpf_fd))
- return -EMSGSIZE;
+ struct nlattr *nla;
if (prog->bpf_name &&
nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name))
return -EMSGSIZE;
+ nla = nla_reserve(skb, TCA_ACT_BPF_TAG, sizeof(prog->filter->tag));
+ if (nla == NULL)
+ return -EMSGSIZE;
+
+ memcpy(nla_data(nla), prog->filter->tag, nla_len(nla));
+
return 0;
}
@@ -226,16 +230,13 @@ static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
return PTR_ERR(fp);
if (tb[TCA_ACT_BPF_NAME]) {
- name = kmemdup(nla_data(tb[TCA_ACT_BPF_NAME]),
- nla_len(tb[TCA_ACT_BPF_NAME]),
- GFP_KERNEL);
+ name = nla_memdup(tb[TCA_ACT_BPF_NAME], GFP_KERNEL);
if (!name) {
bpf_prog_put(fp);
return -ENOMEM;
}
}
- cfg->bpf_fd = bpf_fd;
cfg->bpf_name = name;
cfg->filter = fp;
cfg->is_ebpf = true;
@@ -334,8 +335,6 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
if (cfg.bpf_num_ops)
prog->bpf_num_ops = cfg.bpf_num_ops;
- if (cfg.bpf_fd)
- prog->bpf_fd = cfg.bpf_fd;
prog->tcf_action = parm->action;
rcu_assign_pointer(prog->filter, cfg.filter);
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index eae07a2e774d..f9bb43c25697 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -30,7 +30,7 @@
#define CONNMARK_TAB_MASK 3
-static int connmark_net_id;
+static unsigned int connmark_net_id;
static struct tc_action_ops act_connmark_ops;
static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
@@ -113,6 +113,9 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
if (ret < 0)
return ret;
+ if (!tb[TCA_CONNMARK_PARMS])
+ return -EINVAL;
+
parm = nla_data(tb[TCA_CONNMARK_PARMS]);
if (!tcf_hash_check(tn, parm->index, a, bind)) {
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index e0defcef376d..e978ccd4402c 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -30,6 +30,7 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <net/ip6_checksum.h>
+#include <net/sctp/checksum.h>
#include <net/act_api.h>
@@ -42,7 +43,7 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
[TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
};
-static int csum_net_id;
+static unsigned int csum_net_id;
static struct tc_action_ops act_csum_ops;
static int tcf_csum_init(struct net *net, struct nlattr *nla,
@@ -322,6 +323,25 @@ ignore_obscure_skb:
return 1;
}
+static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
+{
+ struct sctphdr *sctph;
+
+ if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_SCTP)
+ return 1;
+
+ sctph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*sctph));
+ if (!sctph)
+ return 0;
+
+ sctph->checksum = sctp_compute_cksum(skb,
+ skb_network_offset(skb) + ihl);
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return 1;
+}
+
static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
{
const struct iphdr *iph;
@@ -365,6 +385,11 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
ntohs(iph->tot_len), 1))
goto fail;
break;
+ case IPPROTO_SCTP:
+ if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) &&
+ !tcf_csum_sctp(skb, iph->ihl * 4, ntohs(iph->tot_len)))
+ goto fail;
+ break;
}
if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) {
@@ -481,6 +506,11 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)
pl + sizeof(*ip6h), 1))
goto fail;
goto done;
+ case IPPROTO_SCTP:
+ if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) &&
+ !tcf_csum_sctp(skb, hl, pl + sizeof(*ip6h)))
+ goto fail;
+ goto done;
default:
goto ignore_skb;
}
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index e0aa30f83c6c..e6c874a2b283 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -25,7 +25,7 @@
#define GACT_TAB_MASK 15
-static int gact_net_id;
+static unsigned int gact_net_id;
static struct tc_action_ops act_gact_ops;
#ifdef CONFIG_GACT_PROB
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 95c463cbb9a6..71e7ff22f7c9 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -32,10 +32,11 @@
#include <uapi/linux/tc_act/tc_ife.h>
#include <net/tc_act/tc_ife.h>
#include <linux/etherdevice.h>
+#include <net/ife.h>
#define IFE_TAB_MASK 15
-static int ife_net_id;
+static unsigned int ife_net_id;
static int max_metacnt = IFE_META_MAX + 1;
static struct tc_action_ops act_ife_ops;
@@ -46,23 +47,6 @@ static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = {
[TCA_IFE_TYPE] = { .type = NLA_U16},
};
-/* Caller takes care of presenting data in network order
-*/
-int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
-{
- u32 *tlv = (u32 *)(skbdata);
- u16 totlen = nla_total_size(dlen); /*alignment + hdr */
- char *dptr = (char *)tlv + NLA_HDRLEN;
- u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN);
-
- *tlv = htonl(htlv);
- memset(dptr, 0, totlen - NLA_HDRLEN);
- memcpy(dptr, dval, dlen);
-
- return totlen;
-}
-EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
-
int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi)
{
u16 edata = 0;
@@ -637,69 +621,59 @@ int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife,
return 0;
}
-struct ifeheadr {
- __be16 metalen;
- u8 tlv_data[];
-};
-
-struct meta_tlvhdr {
- __be16 type;
- __be16 len;
-};
-
static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_ife_info *ife = to_ife(a);
int action = ife->tcf_action;
- struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data;
- int ifehdrln = (int)ifehdr->metalen;
- struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data);
+ u8 *ifehdr_end;
+ u8 *tlv_data;
+ u16 metalen;
spin_lock(&ife->tcf_lock);
bstats_update(&ife->tcf_bstats, skb);
tcf_lastuse_update(&ife->tcf_tm);
spin_unlock(&ife->tcf_lock);
- ifehdrln = ntohs(ifehdrln);
- if (unlikely(!pskb_may_pull(skb, ifehdrln))) {
+ if (skb_at_tc_ingress(skb))
+ skb_push(skb, skb->dev->hard_header_len);
+
+ tlv_data = ife_decode(skb, &metalen);
+ if (unlikely(!tlv_data)) {
spin_lock(&ife->tcf_lock);
ife->tcf_qstats.drops++;
spin_unlock(&ife->tcf_lock);
return TC_ACT_SHOT;
}
- skb_set_mac_header(skb, ifehdrln);
- __skb_pull(skb, ifehdrln);
- skb->protocol = eth_type_trans(skb, skb->dev);
- ifehdrln -= IFE_METAHDRLEN;
-
- while (ifehdrln > 0) {
- u8 *tlvdata = (u8 *)tlv;
- u16 mtype = tlv->type;
- u16 mlen = tlv->len;
- u16 alen;
+ ifehdr_end = tlv_data + metalen;
+ for (; tlv_data < ifehdr_end; tlv_data = ife_tlv_meta_next(tlv_data)) {
+ u8 *curr_data;
+ u16 mtype;
+ u16 dlen;
- mtype = ntohs(mtype);
- mlen = ntohs(mlen);
- alen = NLA_ALIGN(mlen);
+ curr_data = ife_tlv_meta_decode(tlv_data, &mtype, &dlen, NULL);
- if (find_decode_metaid(skb, ife, mtype, (mlen - NLA_HDRLEN),
- (void *)(tlvdata + NLA_HDRLEN))) {
+ if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) {
/* abuse overlimits to count when we receive metadata
* but dont have an ops for it
*/
- pr_info_ratelimited("Unknown metaid %d alnlen %d\n",
- mtype, mlen);
+ pr_info_ratelimited("Unknown metaid %d dlen %d\n",
+ mtype, dlen);
ife->tcf_qstats.overlimits++;
}
+ }
- tlvdata += alen;
- ifehdrln -= alen;
- tlv = (struct meta_tlvhdr *)tlvdata;
+ if (WARN_ON(tlv_data != ifehdr_end)) {
+ spin_lock(&ife->tcf_lock);
+ ife->tcf_qstats.drops++;
+ spin_unlock(&ife->tcf_lock);
+ return TC_ACT_SHOT;
}
+ skb->protocol = eth_type_trans(skb, skb->dev);
skb_reset_network_header(skb);
+
return action;
}
@@ -727,7 +701,6 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
struct tcf_ife_info *ife = to_ife(a);
int action = ife->tcf_action;
struct ethhdr *oethh; /* outer ether header */
- struct ethhdr *iethh; /* inner eth header */
struct tcf_meta_info *e;
/*
OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
@@ -735,13 +708,13 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
*/
u16 metalen = ife_get_sz(skb, ife);
int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN;
- unsigned int skboff = skb->dev->hard_header_len;
- u32 at = G_TC_AT(skb->tc_verd);
+ unsigned int skboff = 0;
int new_len = skb->len + hdrm;
bool exceed_mtu = false;
- int err;
+ void *ife_meta;
+ int err = 0;
- if (at & AT_EGRESS) {
+ if (!skb_at_tc_ingress(skb)) {
if (new_len > skb->dev->mtu)
exceed_mtu = true;
}
@@ -766,27 +739,10 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
return TC_ACT_SHOT;
}
- err = skb_cow_head(skb, hdrm);
- if (unlikely(err)) {
- ife->tcf_qstats.drops++;
- spin_unlock(&ife->tcf_lock);
- return TC_ACT_SHOT;
- }
-
- if (!(at & AT_EGRESS))
+ if (skb_at_tc_ingress(skb))
skb_push(skb, skb->dev->hard_header_len);
- iethh = (struct ethhdr *)skb->data;
- __skb_push(skb, hdrm);
- memcpy(skb->data, iethh, skb->mac_len);
- skb_reset_mac_header(skb);
- oethh = eth_hdr(skb);
-
- /*total metadata length */
- metalen += IFE_METAHDRLEN;
- metalen = htons(metalen);
- memcpy((skb->data + skboff), &metalen, IFE_METAHDRLEN);
- skboff += IFE_METAHDRLEN;
+ ife_meta = ife_encode(skb, metalen);
/* XXX: we dont have a clever way of telling encode to
* not repeat some of the computations that are done by
@@ -794,7 +750,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
*/
list_for_each_entry(e, &ife->metalist, metalist) {
if (e->ops->encode) {
- err = e->ops->encode(skb, (void *)(skb->data + skboff),
+ err = e->ops->encode(skb, (void *)(ife_meta + skboff),
e);
}
if (err < 0) {
@@ -805,18 +761,15 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
}
skboff += err;
}
+ oethh = (struct ethhdr *)skb->data;
if (!is_zero_ether_addr(ife->eth_src))
ether_addr_copy(oethh->h_source, ife->eth_src);
- else
- ether_addr_copy(oethh->h_source, iethh->h_source);
if (!is_zero_ether_addr(ife->eth_dst))
ether_addr_copy(oethh->h_dest, ife->eth_dst);
- else
- ether_addr_copy(oethh->h_dest, iethh->h_dest);
oethh->h_proto = htons(ife->eth_type);
- if (!(at & AT_EGRESS))
+ if (skb_at_tc_ingress(skb))
skb_pull(skb, skb->dev->hard_header_len);
spin_unlock(&ife->tcf_lock);
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 378c1c976058..992ef8d624f1 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -30,10 +30,10 @@
#define IPT_TAB_MASK 15
-static int ipt_net_id;
+static unsigned int ipt_net_id;
static struct tc_action_ops act_ipt_ops;
-static int xt_net_id;
+static unsigned int xt_net_id;
static struct tc_action_ops act_xt_ops;
static int ipt_init_target(struct xt_entry_target *t, char *table,
@@ -213,6 +213,12 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
int ret = 0, result = 0;
struct tcf_ipt *ipt = to_ipt(a);
struct xt_action_param par;
+ struct nf_hook_state state = {
+ .net = dev_net(skb->dev),
+ .in = skb->dev,
+ .hook = ipt->tcfi_hook,
+ .pf = NFPROTO_IPV4,
+ };
if (skb_unclone(skb, GFP_ATOMIC))
return TC_ACT_UNSPEC;
@@ -226,13 +232,9 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
* worry later - danger - this API seems to have changed
* from earlier kernels
*/
- par.net = dev_net(skb->dev);
- par.in = skb->dev;
- par.out = NULL;
- par.hooknum = ipt->tcfi_hook;
+ par.state = &state;
par.target = ipt->tcfi_t->u.kernel.target;
par.targinfo = ipt->tcfi_t->data;
- par.family = NFPROTO_IPV4;
ret = par.target->target(skb, &par);
switch (ret) {
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 6b07fba5770b..af49c7dca860 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -21,18 +21,36 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/gfp.h>
+#include <linux/if_arp.h>
#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <linux/tc_act/tc_mirred.h>
#include <net/tc_act/tc_mirred.h>
-#include <linux/if_arp.h>
-
#define MIRRED_TAB_MASK 7
static LIST_HEAD(mirred_list);
static DEFINE_SPINLOCK(mirred_list_lock);
+static bool tcf_mirred_is_act_redirect(int action)
+{
+ return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR;
+}
+
+static bool tcf_mirred_act_wants_ingress(int action)
+{
+ switch (action) {
+ case TCA_EGRESS_REDIR:
+ case TCA_EGRESS_MIRROR:
+ return false;
+ case TCA_INGRESS_REDIR:
+ case TCA_INGRESS_MIRROR:
+ return true;
+ default:
+ BUG();
+ }
+}
+
static void tcf_mirred_release(struct tc_action *a, int bind)
{
struct tcf_mirred *m = to_mirred(a);
@@ -51,7 +69,7 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
[TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) },
};
-static int mirred_net_id;
+static unsigned int mirred_net_id;
static struct tc_action_ops act_mirred_ops;
static int tcf_mirred_init(struct net *net, struct nlattr *nla,
@@ -60,11 +78,12 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
{
struct tc_action_net *tn = net_generic(net, mirred_net_id);
struct nlattr *tb[TCA_MIRRED_MAX + 1];
+ bool mac_header_xmit = false;
struct tc_mirred *parm;
struct tcf_mirred *m;
struct net_device *dev;
- int ret, ok_push = 0;
bool exists = false;
+ int ret;
if (nla == NULL)
return -EINVAL;
@@ -82,6 +101,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
switch (parm->eaction) {
case TCA_EGRESS_MIRROR:
case TCA_EGRESS_REDIR:
+ case TCA_INGRESS_REDIR:
+ case TCA_INGRESS_MIRROR:
break;
default:
if (exists)
@@ -95,19 +116,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
tcf_hash_release(*a, bind);
return -ENODEV;
}
- switch (dev->type) {
- case ARPHRD_TUNNEL:
- case ARPHRD_TUNNEL6:
- case ARPHRD_SIT:
- case ARPHRD_IPGRE:
- case ARPHRD_VOID:
- case ARPHRD_NONE:
- ok_push = 0;
- break;
- default:
- ok_push = 1;
- break;
- }
+ mac_header_xmit = dev_is_mac_header_xmit(dev);
} else {
dev = NULL;
}
@@ -136,7 +145,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
dev_put(rcu_dereference_protected(m->tcfm_dev, 1));
dev_hold(dev);
rcu_assign_pointer(m->tcfm_dev, dev);
- m->tcfm_ok_push = ok_push;
+ m->tcfm_mac_header_xmit = mac_header_xmit;
}
if (ret == ACT_P_CREATED) {
@@ -153,15 +162,19 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_mirred *m = to_mirred(a);
+ bool m_mac_header_xmit;
struct net_device *dev;
struct sk_buff *skb2;
- int retval, err;
- u32 at;
+ int retval, err = 0;
+ int m_eaction;
+ int mac_len;
tcf_lastuse_update(&m->tcf_tm);
bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
rcu_read_lock();
+ m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
+ m_eaction = READ_ONCE(m->tcfm_eaction);
retval = READ_ONCE(m->tcf_action);
dev = rcu_dereference(m->tcfm_dev);
if (unlikely(!dev)) {
@@ -175,28 +188,43 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
goto out;
}
- at = G_TC_AT(skb->tc_verd);
skb2 = skb_clone(skb, GFP_ATOMIC);
if (!skb2)
goto out;
- if (!(at & AT_EGRESS)) {
- if (m->tcfm_ok_push)
+ /* If action's target direction differs than filter's direction,
+ * and devices expect a mac header on xmit, then mac push/pull is
+ * needed.
+ */
+ if (skb_at_tc_ingress(skb) != tcf_mirred_act_wants_ingress(m_eaction) &&
+ m_mac_header_xmit) {
+ if (!skb_at_tc_ingress(skb)) {
+ /* caught at egress, act ingress: pull mac */
+ mac_len = skb_network_header(skb) - skb_mac_header(skb);
+ skb_pull_rcsum(skb2, mac_len);
+ } else {
+ /* caught at ingress, act egress: push mac */
skb_push_rcsum(skb2, skb->mac_len);
+ }
}
/* mirror is always swallowed */
- if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
- skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
+ if (tcf_mirred_is_act_redirect(m_eaction)) {
+ skb2->tc_redirected = 1;
+ skb2->tc_from_ingress = skb2->tc_at_ingress;
+ }
skb2->skb_iif = skb->dev->ifindex;
skb2->dev = dev;
- err = dev_queue_xmit(skb2);
+ if (!tcf_mirred_act_wants_ingress(m_eaction))
+ err = dev_queue_xmit(skb2);
+ else
+ err = netif_receive_skb(skb2);
if (err) {
out:
qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats));
- if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
+ if (tcf_mirred_is_act_redirect(m_eaction))
retval = TC_ACT_SHOT;
}
rcu_read_unlock();
@@ -286,6 +314,17 @@ static struct notifier_block mirred_device_notifier = {
.notifier_call = mirred_device_event,
};
+static int tcf_mirred_device(const struct tc_action *a, struct net *net,
+ struct net_device **mirred_dev)
+{
+ int ifindex = tcf_mirred_ifindex(a);
+
+ *mirred_dev = __dev_get_by_index(net, ifindex);
+ if (!*mirred_dev)
+ return -EINVAL;
+ return 0;
+}
+
static struct tc_action_ops act_mirred_ops = {
.kind = "mirred",
.type = TCA_ACT_MIRRED,
@@ -298,6 +337,7 @@ static struct tc_action_ops act_mirred_ops = {
.walk = tcf_mirred_walker,
.lookup = tcf_mirred_search,
.size = sizeof(struct tcf_mirred),
+ .get_dev = tcf_mirred_device,
};
static __net_init int mirred_init_net(struct net *net)
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 8e8b0cc30704..9b6aec665495 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -31,7 +31,7 @@
#define NAT_TAB_MASK 15
-static int nat_net_id;
+static unsigned int nat_net_id;
static struct tc_action_ops act_nat_ops;
static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index cf9b2fe8eac6..c1310472f620 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -22,26 +22,126 @@
#include <net/pkt_sched.h>
#include <linux/tc_act/tc_pedit.h>
#include <net/tc_act/tc_pedit.h>
+#include <uapi/linux/tc_act/tc_pedit.h>
#define PEDIT_TAB_MASK 15
-static int pedit_net_id;
+static unsigned int pedit_net_id;
static struct tc_action_ops act_pedit_ops;
static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
[TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) },
+ [TCA_PEDIT_KEYS_EX] = { .type = NLA_NESTED },
};
+static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = {
+ [TCA_PEDIT_KEY_EX_HTYPE] = { .type = NLA_U16 },
+ [TCA_PEDIT_KEY_EX_CMD] = { .type = NLA_U16 },
+};
+
+static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
+ u8 n)
+{
+ struct tcf_pedit_key_ex *keys_ex;
+ struct tcf_pedit_key_ex *k;
+ const struct nlattr *ka;
+ int err = -EINVAL;
+ int rem;
+
+ if (!nla || !n)
+ return NULL;
+
+ keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL);
+ if (!keys_ex)
+ return ERR_PTR(-ENOMEM);
+
+ k = keys_ex;
+
+ nla_for_each_nested(ka, nla, rem) {
+ struct nlattr *tb[TCA_PEDIT_KEY_EX_MAX + 1];
+
+ if (!n) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ n--;
+
+ if (nla_type(ka) != TCA_PEDIT_KEY_EX) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ err = nla_parse_nested(tb, TCA_PEDIT_KEY_EX_MAX, ka,
+ pedit_key_ex_policy);
+ if (err)
+ goto err_out;
+
+ if (!tb[TCA_PEDIT_KEY_EX_HTYPE] ||
+ !tb[TCA_PEDIT_KEY_EX_CMD]) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]);
+ k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]);
+
+ if (k->htype > TCA_PEDIT_HDR_TYPE_MAX ||
+ k->cmd > TCA_PEDIT_CMD_MAX) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ k++;
+ }
+
+ if (n)
+ goto err_out;
+
+ return keys_ex;
+
+err_out:
+ kfree(keys_ex);
+ return ERR_PTR(err);
+}
+
+static int tcf_pedit_key_ex_dump(struct sk_buff *skb,
+ struct tcf_pedit_key_ex *keys_ex, int n)
+{
+ struct nlattr *keys_start = nla_nest_start(skb, TCA_PEDIT_KEYS_EX);
+
+ for (; n > 0; n--) {
+ struct nlattr *key_start;
+
+ key_start = nla_nest_start(skb, TCA_PEDIT_KEY_EX);
+
+ if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) ||
+ nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) {
+ nlmsg_trim(skb, keys_start);
+ return -EINVAL;
+ }
+
+ nla_nest_end(skb, key_start);
+
+ keys_ex++;
+ }
+
+ nla_nest_end(skb, keys_start);
+
+ return 0;
+}
+
static int tcf_pedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind)
{
struct tc_action_net *tn = net_generic(net, pedit_net_id);
struct nlattr *tb[TCA_PEDIT_MAX + 1];
+ struct nlattr *pattr;
struct tc_pedit *parm;
int ret = 0, err;
struct tcf_pedit *p;
struct tc_pedit_key *keys = NULL;
+ struct tcf_pedit_key_ex *keys_ex;
int ksize;
if (nla == NULL)
@@ -51,13 +151,21 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
if (err < 0)
return err;
- if (tb[TCA_PEDIT_PARMS] == NULL)
+ pattr = tb[TCA_PEDIT_PARMS];
+ if (!pattr)
+ pattr = tb[TCA_PEDIT_PARMS_EX];
+ if (!pattr)
return -EINVAL;
- parm = nla_data(tb[TCA_PEDIT_PARMS]);
+
+ parm = nla_data(pattr);
ksize = parm->nkeys * sizeof(struct tc_pedit_key);
- if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize)
+ if (nla_len(pattr) < sizeof(*parm) + ksize)
return -EINVAL;
+ keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys);
+ if (IS_ERR(keys_ex))
+ return PTR_ERR(keys_ex);
+
if (!tcf_hash_check(tn, parm->index, a, bind)) {
if (!parm->nkeys)
return -EINVAL;
@@ -69,6 +177,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
keys = kmalloc(ksize, GFP_KERNEL);
if (keys == NULL) {
tcf_hash_cleanup(*a, est);
+ kfree(keys_ex);
return -ENOMEM;
}
ret = ACT_P_CREATED;
@@ -81,8 +190,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
p = to_pedit(*a);
if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
keys = kmalloc(ksize, GFP_KERNEL);
- if (keys == NULL)
+ if (!keys) {
+ kfree(keys_ex);
return -ENOMEM;
+ }
}
}
@@ -95,6 +206,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
p->tcfp_nkeys = parm->nkeys;
}
memcpy(p->tcfp_keys, parm->keys, ksize);
+
+ kfree(p->tcfp_keys_ex);
+ p->tcfp_keys_ex = keys_ex;
+
spin_unlock_bh(&p->tcf_lock);
if (ret == ACT_P_CREATED)
tcf_hash_insert(tn, *a);
@@ -106,6 +221,7 @@ static void tcf_pedit_cleanup(struct tc_action *a, int bind)
struct tcf_pedit *p = to_pedit(a);
struct tc_pedit_key *keys = p->tcfp_keys;
kfree(keys);
+ kfree(p->tcfp_keys_ex);
}
static bool offset_valid(struct sk_buff *skb, int offset)
@@ -119,38 +235,88 @@ static bool offset_valid(struct sk_buff *skb, int offset)
return true;
}
+static int pedit_skb_hdr_offset(struct sk_buff *skb,
+ enum pedit_header_type htype, int *hoffset)
+{
+ int ret = -EINVAL;
+
+ switch (htype) {
+ case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH:
+ if (skb_mac_header_was_set(skb)) {
+ *hoffset = skb_mac_offset(skb);
+ ret = 0;
+ }
+ break;
+ case TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK:
+ case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4:
+ case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6:
+ *hoffset = skb_network_offset(skb);
+ ret = 0;
+ break;
+ case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP:
+ case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP:
+ if (skb_transport_header_was_set(skb)) {
+ *hoffset = skb_transport_offset(skb);
+ ret = 0;
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ };
+
+ return ret;
+}
+
static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_pedit *p = to_pedit(a);
int i;
- unsigned int off;
if (skb_unclone(skb, GFP_ATOMIC))
return p->tcf_action;
- off = skb_network_offset(skb);
-
spin_lock(&p->tcf_lock);
tcf_lastuse_update(&p->tcf_tm);
if (p->tcfp_nkeys > 0) {
struct tc_pedit_key *tkey = p->tcfp_keys;
+ struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex;
+ enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
+ enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
u32 *ptr, _data;
int offset = tkey->off;
+ int hoffset;
+ u32 val;
+ int rc;
+
+ if (tkey_ex) {
+ htype = tkey_ex->htype;
+ cmd = tkey_ex->cmd;
+
+ tkey_ex++;
+ }
+
+ rc = pedit_skb_hdr_offset(skb, htype, &hoffset);
+ if (rc) {
+ pr_info("tc filter pedit bad header type specified (0x%x)\n",
+ htype);
+ goto bad;
+ }
if (tkey->offmask) {
char *d, _d;
- if (!offset_valid(skb, off + tkey->at)) {
+ if (!offset_valid(skb, hoffset + tkey->at)) {
pr_info("tc filter pedit 'at' offset %d out of bounds\n",
- off + tkey->at);
+ hoffset + tkey->at);
goto bad;
}
- d = skb_header_pointer(skb, off + tkey->at, 1,
+ d = skb_header_pointer(skb, hoffset + tkey->at, 1,
&_d);
if (!d)
goto bad;
@@ -163,19 +329,32 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
goto bad;
}
- if (!offset_valid(skb, off + offset)) {
+ if (!offset_valid(skb, hoffset + offset)) {
pr_info("tc filter pedit offset %d out of bounds\n",
- offset);
+ hoffset + offset);
goto bad;
}
- ptr = skb_header_pointer(skb, off + offset, 4, &_data);
+ ptr = skb_header_pointer(skb, hoffset + offset, 4, &_data);
if (!ptr)
goto bad;
/* just do it, baby */
- *ptr = ((*ptr & tkey->mask) ^ tkey->val);
+ switch (cmd) {
+ case TCA_PEDIT_KEY_EX_CMD_SET:
+ val = tkey->val;
+ break;
+ case TCA_PEDIT_KEY_EX_CMD_ADD:
+ val = (*ptr + tkey->val) & ~tkey->mask;
+ break;
+ default:
+ pr_info("tc filter pedit bad command (%d)\n",
+ cmd);
+ goto bad;
+ }
+
+ *ptr = ((*ptr & tkey->mask) ^ val);
if (ptr == &_data)
- skb_store_bits(skb, off + offset, ptr, 4);
+ skb_store_bits(skb, hoffset + offset, ptr, 4);
}
goto done;
@@ -215,8 +394,15 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
opt->refcnt = p->tcf_refcnt - ref;
opt->bindcnt = p->tcf_bindcnt - bind;
- if (nla_put(skb, TCA_PEDIT_PARMS, s, opt))
- goto nla_put_failure;
+ if (p->tcfp_keys_ex) {
+ tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys);
+
+ if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt))
+ goto nla_put_failure;
+ } else {
+ if (nla_put(skb, TCA_PEDIT_PARMS, s, opt))
+ goto nla_put_failure;
+ }
tcf_tm_dump(&t, &p->tcf_tm);
if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD))
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index d1bd248fe146..0ba91d1ce994 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -55,7 +55,7 @@ struct tc_police_compat {
/* Each policer is serialized by its individual spinlock */
-static int police_net_id;
+static unsigned int police_net_id;
static struct tc_action_ops act_police_ops;
static int tcf_act_police_walker(struct net *net, struct sk_buff *skb,
@@ -142,8 +142,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
goto failure_unlock;
} else if (tb[TCA_POLICE_AVRATE] &&
(ret == ACT_P_CREATED ||
- !gen_estimator_active(&police->tcf_bstats,
- &police->tcf_rate_est))) {
+ !gen_estimator_active(&police->tcf_rate_est))) {
err = -EINVAL;
goto failure_unlock;
}
@@ -216,13 +215,17 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
bstats_update(&police->tcf_bstats, skb);
tcf_lastuse_update(&police->tcf_tm);
- if (police->tcfp_ewma_rate &&
- police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
- police->tcf_qstats.overlimits++;
- if (police->tcf_action == TC_ACT_SHOT)
- police->tcf_qstats.drops++;
- spin_unlock(&police->tcf_lock);
- return police->tcf_action;
+ if (police->tcfp_ewma_rate) {
+ struct gnet_stats_rate_est64 sample;
+
+ if (!gen_estimator_read(&police->tcf_rate_est, &sample) ||
+ sample.bps >= police->tcfp_ewma_rate) {
+ police->tcf_qstats.overlimits++;
+ if (police->tcf_action == TC_ACT_SHOT)
+ police->tcf_qstats.drops++;
+ spin_unlock(&police->tcf_lock);
+ return police->tcf_action;
+ }
}
if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
new file mode 100644
index 000000000000..0b8217b4763f
--- /dev/null
+++ b/net/sched/act_sample.c
@@ -0,0 +1,276 @@
+/*
+ * net/sched/act_sample.c - Packet sampling tc action
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/tc_act/tc_sample.h>
+#include <net/tc_act/tc_sample.h>
+#include <net/psample.h>
+
+#include <linux/if_arp.h>
+
+#define SAMPLE_TAB_MASK 7
+static unsigned int sample_net_id;
+static struct tc_action_ops act_sample_ops;
+
+static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
+ [TCA_SAMPLE_PARMS] = { .len = sizeof(struct tc_sample) },
+ [TCA_SAMPLE_RATE] = { .type = NLA_U32 },
+ [TCA_SAMPLE_TRUNC_SIZE] = { .type = NLA_U32 },
+ [TCA_SAMPLE_PSAMPLE_GROUP] = { .type = NLA_U32 },
+};
+
+static int tcf_sample_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a, int ovr,
+ int bind)
+{
+ struct tc_action_net *tn = net_generic(net, sample_net_id);
+ struct nlattr *tb[TCA_SAMPLE_MAX + 1];
+ struct psample_group *psample_group;
+ struct tc_sample *parm;
+ struct tcf_sample *s;
+ bool exists = false;
+ int ret;
+
+ if (!nla)
+ return -EINVAL;
+ ret = nla_parse_nested(tb, TCA_SAMPLE_MAX, nla, sample_policy);
+ if (ret < 0)
+ return ret;
+ if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] ||
+ !tb[TCA_SAMPLE_PSAMPLE_GROUP])
+ return -EINVAL;
+
+ parm = nla_data(tb[TCA_SAMPLE_PARMS]);
+
+ exists = tcf_hash_check(tn, parm->index, a, bind);
+ if (exists && bind)
+ return 0;
+
+ if (!exists) {
+ ret = tcf_hash_create(tn, parm->index, est, a,
+ &act_sample_ops, bind, false);
+ if (ret)
+ return ret;
+ ret = ACT_P_CREATED;
+ } else {
+ tcf_hash_release(*a, bind);
+ if (!ovr)
+ return -EEXIST;
+ }
+ s = to_sample(*a);
+
+ s->tcf_action = parm->action;
+ s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
+ s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
+ psample_group = psample_group_get(net, s->psample_group_num);
+ if (!psample_group) {
+ if (ret == ACT_P_CREATED)
+ tcf_hash_release(*a, bind);
+ return -ENOMEM;
+ }
+ RCU_INIT_POINTER(s->psample_group, psample_group);
+
+ if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
+ s->truncate = true;
+ s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]);
+ }
+
+ if (ret == ACT_P_CREATED)
+ tcf_hash_insert(tn, *a);
+ return ret;
+}
+
+static void tcf_sample_cleanup_rcu(struct rcu_head *rcu)
+{
+ struct tcf_sample *s = container_of(rcu, struct tcf_sample, rcu);
+ struct psample_group *psample_group;
+
+ psample_group = rcu_dereference_protected(s->psample_group, 1);
+ RCU_INIT_POINTER(s->psample_group, NULL);
+ psample_group_put(psample_group);
+}
+
+static void tcf_sample_cleanup(struct tc_action *a, int bind)
+{
+ struct tcf_sample *s = to_sample(a);
+
+ call_rcu(&s->rcu, tcf_sample_cleanup_rcu);
+}
+
+static bool tcf_sample_dev_ok_push(struct net_device *dev)
+{
+ switch (dev->type) {
+ case ARPHRD_TUNNEL:
+ case ARPHRD_TUNNEL6:
+ case ARPHRD_SIT:
+ case ARPHRD_IPGRE:
+ case ARPHRD_VOID:
+ case ARPHRD_NONE:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_sample *s = to_sample(a);
+ struct psample_group *psample_group;
+ int retval;
+ int size;
+ int iif;
+ int oif;
+
+ tcf_lastuse_update(&s->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb);
+ retval = READ_ONCE(s->tcf_action);
+
+ rcu_read_lock();
+ psample_group = rcu_dereference(s->psample_group);
+
+ /* randomly sample packets according to rate */
+ if (psample_group && (prandom_u32() % s->rate == 0)) {
+ if (!skb_at_tc_ingress(skb)) {
+ iif = skb->skb_iif;
+ oif = skb->dev->ifindex;
+ } else {
+ iif = skb->dev->ifindex;
+ oif = 0;
+ }
+
+ /* on ingress, the mac header gets popped, so push it back */
+ if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
+ skb_push(skb, skb->mac_len);
+
+ size = s->truncate ? s->trunc_size : skb->len;
+ psample_sample_packet(psample_group, skb, size, iif, oif,
+ s->rate);
+
+ if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
+ skb_pull(skb, skb->mac_len);
+ }
+
+ rcu_read_unlock();
+ return retval;
+}
+
+static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_sample *s = to_sample(a);
+ struct tc_sample opt = {
+ .index = s->tcf_index,
+ .action = s->tcf_action,
+ .refcnt = s->tcf_refcnt - ref,
+ .bindcnt = s->tcf_bindcnt - bind,
+ };
+ struct tcf_t t;
+
+ if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &s->tcf_tm);
+ if (nla_put_64bit(skb, TCA_SAMPLE_TM, sizeof(t), &t, TCA_SAMPLE_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_SAMPLE_RATE, s->rate))
+ goto nla_put_failure;
+
+ if (s->truncate)
+ if (nla_put_u32(skb, TCA_SAMPLE_TRUNC_SIZE, s->trunc_size))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_sample_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops)
+{
+ struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops);
+}
+
+static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
+static struct tc_action_ops act_sample_ops = {
+ .kind = "sample",
+ .type = TCA_ACT_SAMPLE,
+ .owner = THIS_MODULE,
+ .act = tcf_sample_act,
+ .dump = tcf_sample_dump,
+ .init = tcf_sample_init,
+ .cleanup = tcf_sample_cleanup,
+ .walk = tcf_sample_walker,
+ .lookup = tcf_sample_search,
+ .size = sizeof(struct tcf_sample),
+};
+
+static __net_init int sample_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+ return tc_action_net_init(tn, &act_sample_ops, SAMPLE_TAB_MASK);
+}
+
+static void __net_exit sample_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations sample_net_ops = {
+ .init = sample_init_net,
+ .exit = sample_exit_net,
+ .id = &sample_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init sample_init_module(void)
+{
+ return tcf_register_action(&act_sample_ops, &sample_net_ops);
+}
+
+static void __exit sample_cleanup_module(void)
+{
+ tcf_unregister_action(&act_sample_ops, &sample_net_ops);
+}
+
+module_init(sample_init_module);
+module_exit(sample_cleanup_module);
+
+MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
+MODULE_DESCRIPTION("Packet sampling action");
+MODULE_LICENSE("GPL v2");
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 289af6f9bb3b..823a73ad0c60 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -26,7 +26,7 @@
#define SIMP_TAB_MASK 7
-static int simp_net_id;
+static unsigned int simp_net_id;
static struct tc_action_ops act_simp_ops;
#define SIMP_MAX_DATA 32
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index a133dcb82132..06ccae3c12ee 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -29,7 +29,7 @@
#define SKBEDIT_TAB_MASK 15
-static int skbedit_net_id;
+static unsigned int skbedit_net_id;
static struct tc_action_ops act_skbedit_ops;
static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
@@ -46,8 +46,10 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
if (d->flags & SKBEDIT_F_QUEUE_MAPPING &&
skb->dev->real_num_tx_queues > d->queue_mapping)
skb_set_queue_mapping(skb, d->queue_mapping);
- if (d->flags & SKBEDIT_F_MARK)
- skb->mark = d->mark;
+ if (d->flags & SKBEDIT_F_MARK) {
+ skb->mark &= ~d->mask;
+ skb->mark |= d->mark & d->mask;
+ }
if (d->flags & SKBEDIT_F_PTYPE)
skb->pkt_type = d->ptype;
@@ -61,6 +63,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
[TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) },
[TCA_SKBEDIT_MARK] = { .len = sizeof(u32) },
[TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) },
+ [TCA_SKBEDIT_MASK] = { .len = sizeof(u32) },
};
static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
@@ -71,7 +74,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
struct tc_skbedit *parm;
struct tcf_skbedit *d;
- u32 flags = 0, *priority = NULL, *mark = NULL;
+ u32 flags = 0, *priority = NULL, *mark = NULL, *mask = NULL;
u16 *queue_mapping = NULL, *ptype = NULL;
bool exists = false;
int ret = 0, err;
@@ -108,6 +111,11 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
mark = nla_data(tb[TCA_SKBEDIT_MARK]);
}
+ if (tb[TCA_SKBEDIT_MASK] != NULL) {
+ flags |= SKBEDIT_F_MASK;
+ mask = nla_data(tb[TCA_SKBEDIT_MASK]);
+ }
+
parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
exists = tcf_hash_check(tn, parm->index, a, bind);
@@ -145,6 +153,10 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
d->mark = *mark;
if (flags & SKBEDIT_F_PTYPE)
d->ptype = *ptype;
+ /* default behaviour is to use all the bits */
+ d->mask = 0xffffffff;
+ if (flags & SKBEDIT_F_MASK)
+ d->mask = *mask;
d->tcf_action = parm->action;
@@ -182,6 +194,9 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
if ((d->flags & SKBEDIT_F_PTYPE) &&
nla_put_u16(skb, TCA_SKBEDIT_PTYPE, d->ptype))
goto nla_put_failure;
+ if ((d->flags & SKBEDIT_F_MASK) &&
+ nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask))
+ goto nla_put_failure;
tcf_tm_dump(&t, &d->tcf_tm);
if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index e7d96381c908..c736627f8f4a 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -22,7 +22,7 @@
#define SKBMOD_TAB_MASK 15
-static int skbmod_net_id;
+static unsigned int skbmod_net_id;
static struct tc_action_ops act_skbmod_ops;
#define MAX_EDIT_LEN ETH_HLEN
@@ -228,7 +228,6 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
return skb->len;
nla_put_failure:
- rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index af47bdf2f483..e3a58e021198 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -16,14 +16,13 @@
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
-#include <net/dst_metadata.h>
#include <linux/tc_act/tc_tunnel_key.h>
#include <net/tc_act/tc_tunnel_key.h>
#define TUNNEL_KEY_TAB_MASK 15
-static int tunnel_key_net_id;
+static unsigned int tunnel_key_net_id;
static struct tc_action_ops act_tunnel_key_ops;
static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
@@ -67,6 +66,7 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
[TCA_TUNNEL_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) },
[TCA_TUNNEL_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) },
[TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 },
+ [TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16},
};
static int tunnel_key_init(struct net *net, struct nlattr *nla,
@@ -81,6 +81,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
struct tc_tunnel_key *parm;
struct tcf_tunnel_key *t;
bool exists = false;
+ __be16 dst_port = 0;
__be64 key_id;
int ret = 0;
int err;
@@ -111,6 +112,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]));
+ if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT])
+ dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]);
+
if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] &&
tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) {
__be32 saddr;
@@ -120,7 +124,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]);
metadata = __ip_tun_set_dst(saddr, daddr, 0, 0,
- TUNNEL_KEY, key_id, 0);
+ dst_port, TUNNEL_KEY,
+ key_id, 0);
} else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] &&
tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) {
struct in6_addr saddr;
@@ -129,8 +134,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
- metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, 0,
- TUNNEL_KEY, key_id, 0);
+ metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, dst_port,
+ 0, TUNNEL_KEY,
+ key_id, 0);
}
if (!metadata) {
@@ -258,7 +264,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) ||
tunnel_key_dump_addresses(skb,
- &params->tcft_enc_metadata->u.tun_info))
+ &params->tcft_enc_metadata->u.tun_info) ||
+ nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst))
goto nla_put_failure;
}
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index b57fcbcefea1..19e0dba305ce 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -21,7 +21,7 @@
#define VLAN_TAB_MASK 15
-static int vlan_net_id;
+static unsigned int vlan_net_id;
static struct tc_action_ops act_vlan_ops;
static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index b05d4a2155b0..732f7cae459d 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -19,6 +19,7 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
+#include <linux/err.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/kmod.h>
@@ -38,14 +39,14 @@ static DEFINE_RWLOCK(cls_mod_lock);
/* Find classifier type by string name */
-static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind)
+static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
{
const struct tcf_proto_ops *t, *res = NULL;
if (kind) {
read_lock(&cls_mod_lock);
list_for_each_entry(t, &tcf_proto_base, head) {
- if (nla_strcmp(kind, t->kind) == 0) {
+ if (strcmp(kind, t->kind) == 0) {
if (try_module_get(t->owner))
res = t;
break;
@@ -127,6 +128,77 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
return first;
}
+static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
+ u32 prio, u32 parent, struct Qdisc *q)
+{
+ struct tcf_proto *tp;
+ int err;
+
+ tp = kzalloc(sizeof(*tp), GFP_KERNEL);
+ if (!tp)
+ return ERR_PTR(-ENOBUFS);
+
+ err = -ENOENT;
+ tp->ops = tcf_proto_lookup_ops(kind);
+ if (!tp->ops) {
+#ifdef CONFIG_MODULES
+ rtnl_unlock();
+ request_module("cls_%s", kind);
+ rtnl_lock();
+ tp->ops = tcf_proto_lookup_ops(kind);
+ /* We dropped the RTNL semaphore in order to perform
+ * the module load. So, even if we succeeded in loading
+ * the module we have to replay the request. We indicate
+ * this using -EAGAIN.
+ */
+ if (tp->ops) {
+ module_put(tp->ops->owner);
+ err = -EAGAIN;
+ } else {
+ err = -ENOENT;
+ }
+ goto errout;
+#endif
+ }
+ tp->classify = tp->ops->classify;
+ tp->protocol = protocol;
+ tp->prio = prio;
+ tp->classid = parent;
+ tp->q = q;
+
+ err = tp->ops->init(tp);
+ if (err) {
+ module_put(tp->ops->owner);
+ goto errout;
+ }
+ return tp;
+
+errout:
+ kfree(tp);
+ return ERR_PTR(err);
+}
+
+static bool tcf_proto_destroy(struct tcf_proto *tp, bool force)
+{
+ if (tp->ops->destroy(tp, force)) {
+ module_put(tp->ops->owner);
+ kfree_rcu(tp, rcu);
+ return true;
+ }
+ return false;
+}
+
+void tcf_destroy_chain(struct tcf_proto __rcu **fl)
+{
+ struct tcf_proto *tp;
+
+ while ((tp = rtnl_dereference(*fl)) != NULL) {
+ RCU_INIT_POINTER(*fl, tp->next);
+ tcf_proto_destroy(tp, true);
+ }
+}
+EXPORT_SYMBOL(tcf_destroy_chain);
+
/* Add/change/delete/get a filter node */
static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
@@ -142,19 +214,21 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
struct Qdisc *q;
struct tcf_proto __rcu **back;
struct tcf_proto __rcu **chain;
+ struct tcf_proto *next;
struct tcf_proto *tp;
- const struct tcf_proto_ops *tp_ops;
const struct Qdisc_class_ops *cops;
unsigned long cl;
unsigned long fh;
int err;
- int tp_created = 0;
+ int tp_created;
if ((n->nlmsg_type != RTM_GETTFILTER) &&
!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;
replay:
+ tp_created = 0;
+
err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
if (err < 0)
return err;
@@ -220,9 +294,10 @@ replay:
/* And the last stroke */
chain = cops->tcf_chain(q, cl);
- err = -EINVAL;
- if (chain == NULL)
+ if (chain == NULL) {
+ err = -EINVAL;
goto errout;
+ }
if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) {
tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER);
tcf_destroy_chain(chain);
@@ -237,10 +312,13 @@ replay:
if (tp->prio >= prio) {
if (tp->prio == prio) {
if (!nprio ||
- (tp->protocol != protocol && protocol))
+ (tp->protocol != protocol && protocol)) {
+ err = -EINVAL;
goto errout;
- } else
+ }
+ } else {
tp = NULL;
+ }
break;
}
}
@@ -248,109 +326,69 @@ replay:
if (tp == NULL) {
/* Proto-tcf does not exist, create new one */
- if (tca[TCA_KIND] == NULL || !protocol)
+ if (tca[TCA_KIND] == NULL || !protocol) {
+ err = -EINVAL;
goto errout;
+ }
- err = -ENOENT;
if (n->nlmsg_type != RTM_NEWTFILTER ||
- !(n->nlmsg_flags & NLM_F_CREATE))
+ !(n->nlmsg_flags & NLM_F_CREATE)) {
+ err = -ENOENT;
goto errout;
+ }
+ if (!nprio)
+ nprio = TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
- /* Create new proto tcf */
-
- err = -ENOBUFS;
- tp = kzalloc(sizeof(*tp), GFP_KERNEL);
- if (tp == NULL)
- goto errout;
- err = -ENOENT;
- tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]);
- if (tp_ops == NULL) {
-#ifdef CONFIG_MODULES
- struct nlattr *kind = tca[TCA_KIND];
- char name[IFNAMSIZ];
-
- if (kind != NULL &&
- nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
- rtnl_unlock();
- request_module("cls_%s", name);
- rtnl_lock();
- tp_ops = tcf_proto_lookup_ops(kind);
- /* We dropped the RTNL semaphore in order to
- * perform the module load. So, even if we
- * succeeded in loading the module we have to
- * replay the request. We indicate this using
- * -EAGAIN.
- */
- if (tp_ops != NULL) {
- module_put(tp_ops->owner);
- err = -EAGAIN;
- }
- }
-#endif
- kfree(tp);
+ tp = tcf_proto_create(nla_data(tca[TCA_KIND]),
+ protocol, nprio, parent, q);
+ if (IS_ERR(tp)) {
+ err = PTR_ERR(tp);
goto errout;
}
- tp->ops = tp_ops;
- tp->protocol = protocol;
- tp->prio = nprio ? :
- TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
- tp->q = q;
- tp->classify = tp_ops->classify;
- tp->classid = parent;
-
- err = tp_ops->init(tp);
- if (err != 0) {
- module_put(tp_ops->owner);
- kfree(tp);
- goto errout;
- }
-
tp_created = 1;
-
- } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind))
+ } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
+ err = -EINVAL;
goto errout;
+ }
fh = tp->ops->get(tp, t->tcm_handle);
if (fh == 0) {
if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
- struct tcf_proto *next = rtnl_dereference(tp->next);
-
+ next = rtnl_dereference(tp->next);
RCU_INIT_POINTER(*back, next);
-
tfilter_notify(net, skb, n, tp, fh,
RTM_DELTFILTER, false);
- tcf_destroy(tp, true);
+ tcf_proto_destroy(tp, true);
err = 0;
goto errout;
}
- err = -ENOENT;
if (n->nlmsg_type != RTM_NEWTFILTER ||
- !(n->nlmsg_flags & NLM_F_CREATE))
+ !(n->nlmsg_flags & NLM_F_CREATE)) {
+ err = -ENOENT;
goto errout;
+ }
} else {
switch (n->nlmsg_type) {
case RTM_NEWTFILTER:
- err = -EEXIST;
if (n->nlmsg_flags & NLM_F_EXCL) {
if (tp_created)
- tcf_destroy(tp, true);
+ tcf_proto_destroy(tp, true);
+ err = -EEXIST;
goto errout;
}
break;
case RTM_DELTFILTER:
err = tp->ops->delete(tp, fh);
- if (err == 0) {
- struct tcf_proto *next = rtnl_dereference(tp->next);
-
- tfilter_notify(net, skb, n, tp,
- t->tcm_handle,
- RTM_DELTFILTER, false);
- if (tcf_destroy(tp, false))
- RCU_INIT_POINTER(*back, next);
- }
+ if (err)
+ goto errout;
+ next = rtnl_dereference(tp->next);
+ tfilter_notify(net, skb, n, tp, t->tcm_handle,
+ RTM_DELTFILTER, false);
+ if (tcf_proto_destroy(tp, false))
+ RCU_INIT_POINTER(*back, next);
goto errout;
case RTM_GETTFILTER:
err = tfilter_notify(net, skb, n, tp, fh,
@@ -372,7 +410,7 @@ replay:
tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false);
} else {
if (tp_created)
- tcf_destroy(tp, true);
+ tcf_proto_destroy(tp, true);
}
errout:
@@ -682,6 +720,30 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
}
EXPORT_SYMBOL(tcf_exts_dump_stats);
+int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts,
+ struct net_device **hw_dev)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ const struct tc_action *a;
+ LIST_HEAD(actions);
+
+ if (tc_no_actions(exts))
+ return -EINVAL;
+
+ tcf_exts_to_list(exts, &actions);
+ list_for_each_entry(a, &actions, list) {
+ if (a->ops->get_dev) {
+ a->ops->get_dev(a, dev_net(dev), hw_dev);
+ break;
+ }
+ }
+ if (*hw_dev)
+ return 0;
+#endif
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(tcf_exts_get_dev);
+
static int __init tc_filter_init(void)
{
rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL);
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 0a47ba5e6109..80f688436dd7 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -45,10 +45,7 @@ struct cls_bpf_prog {
u32 gen_flags;
struct tcf_exts exts;
u32 handle;
- union {
- u32 bpf_fd;
- u16 bpf_num_ops;
- };
+ u16 bpf_num_ops;
struct sock_filter *bpf_ops;
const char *bpf_name;
struct tcf_proto *tp;
@@ -151,6 +148,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_bpf_offload bpf_offload = {};
struct tc_to_netdev offload;
+ int err;
offload.type = TC_SETUP_CLSBPF;
offload.cls_bpf = &bpf_offload;
@@ -162,8 +160,13 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
bpf_offload.exts_integrated = prog->exts_integrated;
bpf_offload.gen_flags = prog->gen_flags;
- return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
- tp->protocol, &offload);
+ err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+ tp->protocol, &offload);
+
+ if (!err && (cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE))
+ prog->gen_flags |= TCA_CLS_FLAGS_IN_HW;
+
+ return err;
}
static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
@@ -244,7 +247,7 @@ static int cls_bpf_init(struct tcf_proto *tp)
return 0;
}
-static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
+static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog)
{
tcf_exts_destroy(&prog->exts);
@@ -258,22 +261,22 @@ static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
kfree(prog);
}
-static void __cls_bpf_delete_prog(struct rcu_head *rcu)
+static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu)
{
- struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu);
-
- cls_bpf_delete_prog(prog->tp, prog);
+ __cls_bpf_delete_prog(container_of(rcu, struct cls_bpf_prog, rcu));
}
-static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
+static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog)
{
- struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg;
-
cls_bpf_stop_offload(tp, prog);
list_del_rcu(&prog->link);
tcf_unbind_filter(tp, &prog->res);
- call_rcu(&prog->rcu, __cls_bpf_delete_prog);
+ call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu);
+}
+static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
+{
+ __cls_bpf_delete(tp, (struct cls_bpf_prog *) arg);
return 0;
}
@@ -285,12 +288,8 @@ static bool cls_bpf_destroy(struct tcf_proto *tp, bool force)
if (!force && !list_empty(&head->plist))
return false;
- list_for_each_entry_safe(prog, tmp, &head->plist, link) {
- cls_bpf_stop_offload(tp, prog);
- list_del_rcu(&prog->link);
- tcf_unbind_filter(tp, &prog->res);
- call_rcu(&prog->rcu, __cls_bpf_delete_prog);
- }
+ list_for_each_entry_safe(prog, tmp, &head->plist, link)
+ __cls_bpf_delete(tp, prog);
kfree_rcu(head, rcu);
return true;
@@ -365,9 +364,7 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
return PTR_ERR(fp);
if (tb[TCA_BPF_NAME]) {
- name = kmemdup(nla_data(tb[TCA_BPF_NAME]),
- nla_len(tb[TCA_BPF_NAME]),
- GFP_KERNEL);
+ name = nla_memdup(tb[TCA_BPF_NAME], GFP_KERNEL);
if (!name) {
bpf_prog_put(fp);
return -ENOMEM;
@@ -375,7 +372,6 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
}
prog->bpf_ops = NULL;
- prog->bpf_fd = bpf_fd;
prog->bpf_name = name;
prog->filter = fp;
@@ -517,14 +513,17 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
ret = cls_bpf_offload(tp, prog, oldprog);
if (ret) {
- cls_bpf_delete_prog(tp, prog);
+ __cls_bpf_delete_prog(prog);
return ret;
}
+ if (!tc_in_hw(prog->gen_flags))
+ prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
if (oldprog) {
list_replace_rcu(&oldprog->link, &prog->link);
tcf_unbind_filter(tp, &oldprog->res);
- call_rcu(&oldprog->rcu, __cls_bpf_delete_prog);
+ call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu);
} else {
list_add_rcu(&prog->link, &head->plist);
}
@@ -559,13 +558,18 @@ static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog,
static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog,
struct sk_buff *skb)
{
- if (nla_put_u32(skb, TCA_BPF_FD, prog->bpf_fd))
- return -EMSGSIZE;
+ struct nlattr *nla;
if (prog->bpf_name &&
nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name))
return -EMSGSIZE;
+ nla = nla_reserve(skb, TCA_BPF_TAG, sizeof(prog->filter->tag));
+ if (nla == NULL)
+ return -EMSGSIZE;
+
+ memcpy(nla_data(nla), prog->filter->tag, nla_len(nla));
+
return 0;
}
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 6575aba87630..3d6b9286c203 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -129,7 +129,7 @@ static u32 flow_get_mark(const struct sk_buff *skb)
static u32 flow_get_nfct(const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- return addr_fold(skb->nfct);
+ return addr_fold(skb_nfct(skb));
#else
return 0;
#endif
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 904442421db3..9d0c99d2e9fb 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -39,11 +39,14 @@ struct fl_flow_key {
struct flow_dissector_key_ipv6_addrs ipv6;
};
struct flow_dissector_key_ports tp;
+ struct flow_dissector_key_icmp icmp;
+ struct flow_dissector_key_arp arp;
struct flow_dissector_key_keyid enc_key_id;
union {
struct flow_dissector_key_ipv4_addrs enc_ipv4;
struct flow_dissector_key_ipv6_addrs enc_ipv6;
};
+ struct flow_dissector_key_ports enc_tp;
} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
struct fl_flow_mask_range {
@@ -81,6 +84,8 @@ struct cls_fl_filter {
u32 handle;
u32 flags;
struct rcu_head rcu;
+ struct tc_to_netdev tc;
+ struct net_device *hw_dev;
};
static unsigned short int fl_mask_range(const struct fl_flow_mask *mask)
@@ -129,6 +134,14 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
}
+static struct cls_fl_filter *fl_lookup(struct cls_fl_head *head,
+ struct fl_flow_key *mkey)
+{
+ return rhashtable_lookup_fast(&head->ht,
+ fl_key_get_start(mkey, &head->mask),
+ head->ht_params);
+}
+
static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
@@ -149,16 +162,22 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
switch (ip_tunnel_info_af(info)) {
case AF_INET:
+ skb_key.enc_control.addr_type =
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS;
skb_key.enc_ipv4.src = key->u.ipv4.src;
skb_key.enc_ipv4.dst = key->u.ipv4.dst;
break;
case AF_INET6:
+ skb_key.enc_control.addr_type =
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS;
skb_key.enc_ipv6.src = key->u.ipv6.src;
skb_key.enc_ipv6.dst = key->u.ipv6.dst;
break;
}
skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id);
+ skb_key.enc_tp.src = key->tp_src;
+ skb_key.enc_tp.dst = key->tp_dst;
}
skb_key.indev_ifindex = skb->skb_iif;
@@ -170,9 +189,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
fl_set_masked_key(&skb_mkey, &skb_key, &head->mask);
- f = rhashtable_lookup_fast(&head->ht,
- fl_key_get_start(&skb_mkey, &head->mask),
- head->ht_params);
+ f = fl_lookup(head, &skb_mkey);
if (f && !tc_skip_sw(f->flags)) {
*res = f->res;
return tcf_exts_exec(skb, &f->exts, res);
@@ -202,75 +219,95 @@ static void fl_destroy_filter(struct rcu_head *head)
kfree(f);
}
-static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
+static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
{
- struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_flower_offload offload = {0};
- struct tc_to_netdev tc;
+ struct net_device *dev = f->hw_dev;
+ struct tc_to_netdev *tc = &f->tc;
- if (!tc_should_offload(dev, tp, 0))
+ if (!tc_can_offload(dev, tp))
return;
offload.command = TC_CLSFLOWER_DESTROY;
- offload.cookie = cookie;
+ offload.prio = tp->prio;
+ offload.cookie = (unsigned long)f;
- tc.type = TC_SETUP_CLSFLOWER;
- tc.cls_flower = &offload;
+ tc->type = TC_SETUP_CLSFLOWER;
+ tc->cls_flower = &offload;
- dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, tc);
}
static int fl_hw_replace_filter(struct tcf_proto *tp,
struct flow_dissector *dissector,
struct fl_flow_key *mask,
- struct fl_flow_key *key,
- struct tcf_exts *actions,
- unsigned long cookie, u32 flags)
+ struct cls_fl_filter *f)
{
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_flower_offload offload = {0};
- struct tc_to_netdev tc;
+ struct tc_to_netdev *tc = &f->tc;
int err;
- if (!tc_should_offload(dev, tp, flags))
- return tc_skip_sw(flags) ? -EINVAL : 0;
+ if (!tc_can_offload(dev, tp)) {
+ if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev) ||
+ (f->hw_dev && !tc_can_offload(f->hw_dev, tp))) {
+ f->hw_dev = dev;
+ return tc_skip_sw(f->flags) ? -EINVAL : 0;
+ }
+ dev = f->hw_dev;
+ tc->egress_dev = true;
+ } else {
+ f->hw_dev = dev;
+ }
offload.command = TC_CLSFLOWER_REPLACE;
- offload.cookie = cookie;
+ offload.prio = tp->prio;
+ offload.cookie = (unsigned long)f;
offload.dissector = dissector;
offload.mask = mask;
- offload.key = key;
- offload.exts = actions;
+ offload.key = &f->mkey;
+ offload.exts = &f->exts;
- tc.type = TC_SETUP_CLSFLOWER;
- tc.cls_flower = &offload;
+ tc->type = TC_SETUP_CLSFLOWER;
+ tc->cls_flower = &offload;
err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
- &tc);
+ tc);
+ if (!err)
+ f->flags |= TCA_CLS_FLAGS_IN_HW;
- if (tc_skip_sw(flags))
+ if (tc_skip_sw(f->flags))
return err;
-
return 0;
}
static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
{
- struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_flower_offload offload = {0};
- struct tc_to_netdev tc;
+ struct net_device *dev = f->hw_dev;
+ struct tc_to_netdev *tc = &f->tc;
- if (!tc_should_offload(dev, tp, 0))
+ if (!tc_can_offload(dev, tp))
return;
offload.command = TC_CLSFLOWER_STATS;
+ offload.prio = tp->prio;
offload.cookie = (unsigned long)f;
offload.exts = &f->exts;
- tc.type = TC_SETUP_CLSFLOWER;
- tc.cls_flower = &offload;
+ tc->type = TC_SETUP_CLSFLOWER;
+ tc->cls_flower = &offload;
+
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, tc);
+}
- dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f)
+{
+ list_del_rcu(&f->list);
+ if (!tc_skip_hw(f->flags))
+ fl_hw_destroy_filter(tp, f);
+ tcf_unbind_filter(tp, &f->res);
+ call_rcu(&f->rcu, fl_destroy_filter);
}
static void fl_destroy_sleepable(struct work_struct *work)
@@ -299,14 +336,12 @@ static bool fl_destroy(struct tcf_proto *tp, bool force)
if (!force && !list_empty(&head->filters))
return false;
- list_for_each_entry_safe(f, next, &head->filters, list) {
- fl_hw_destroy_filter(tp, (unsigned long)f);
- list_del_rcu(&f->list);
- call_rcu(&f->rcu, fl_destroy_filter);
- }
+ list_for_each_entry_safe(f, next, &head->filters, list)
+ __fl_delete(tp, f);
__module_get(THIS_MODULE);
call_rcu(&head->rcu, fl_destroy_rcu);
+
return true;
}
@@ -360,6 +395,34 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
[TCA_FLOWER_KEY_TCP_DST_MASK] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_UDP_SRC_MASK] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_UDP_DST_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_SCTP_SRC_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_SCTP_DST_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_SCTP_SRC] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_SCTP_DST] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_UDP_SRC_PORT] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_UDP_DST_PORT] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_FLAGS] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_FLAGS_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ICMPV4_TYPE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV4_TYPE_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV4_CODE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV4_CODE_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV6_TYPE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV6_TYPE_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV6_CODE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV6_CODE_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ARP_SIP] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ARP_SIP_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ARP_TIP] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ARP_TIP_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ARP_OP] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ARP_OP_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ARP_SHA] = { .len = ETH_ALEN },
+ [TCA_FLOWER_KEY_ARP_SHA_MASK] = { .len = ETH_ALEN },
+ [TCA_FLOWER_KEY_ARP_THA] = { .len = ETH_ALEN },
+ [TCA_FLOWER_KEY_ARP_THA_MASK] = { .len = ETH_ALEN },
};
static void fl_set_key_val(struct nlattr **tb,
@@ -394,10 +457,43 @@ static void fl_set_key_vlan(struct nlattr **tb,
}
}
+static void fl_set_key_flag(u32 flower_key, u32 flower_mask,
+ u32 *dissector_key, u32 *dissector_mask,
+ u32 flower_flag_bit, u32 dissector_flag_bit)
+{
+ if (flower_mask & flower_flag_bit) {
+ *dissector_mask |= dissector_flag_bit;
+ if (flower_key & flower_flag_bit)
+ *dissector_key |= dissector_flag_bit;
+ }
+}
+
+static int fl_set_key_flags(struct nlattr **tb,
+ u32 *flags_key, u32 *flags_mask)
+{
+ u32 key, mask;
+
+ /* mask is mandatory for flags */
+ if (!tb[TCA_FLOWER_KEY_FLAGS_MASK])
+ return -EINVAL;
+
+ key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS]));
+ mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
+
+ *flags_key = 0;
+ *flags_mask = 0;
+
+ fl_set_key_flag(key, mask, flags_key, flags_mask,
+ TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT);
+
+ return 0;
+}
+
static int fl_set_key(struct net *net, struct nlattr **tb,
struct fl_flow_key *key, struct fl_flow_key *mask)
{
__be16 ethertype;
+ int ret = 0;
#ifdef CONFIG_NET_CLS_IND
if (tb[TCA_FLOWER_INDEV]) {
int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]);
@@ -439,6 +535,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) {
key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ mask->control.addr_type = ~0;
fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC,
&mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK,
sizeof(key->ipv4.src));
@@ -447,6 +544,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
sizeof(key->ipv4.dst));
} else if (tb[TCA_FLOWER_KEY_IPV6_SRC] || tb[TCA_FLOWER_KEY_IPV6_DST]) {
key->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ mask->control.addr_type = ~0;
fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC,
&mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK,
sizeof(key->ipv6.src));
@@ -469,11 +567,56 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST,
&mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK,
sizeof(key->tp.dst));
+ } else if (key->basic.ip_proto == IPPROTO_SCTP) {
+ fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_SCTP_SRC,
+ &mask->tp.src, TCA_FLOWER_KEY_SCTP_SRC_MASK,
+ sizeof(key->tp.src));
+ fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_SCTP_DST,
+ &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK,
+ sizeof(key->tp.dst));
+ } else if (key->basic.n_proto == htons(ETH_P_IP) &&
+ key->basic.ip_proto == IPPROTO_ICMP) {
+ fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV4_TYPE,
+ &mask->icmp.type,
+ TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,
+ sizeof(key->icmp.type));
+ fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE,
+ &mask->icmp.code,
+ TCA_FLOWER_KEY_ICMPV4_CODE_MASK,
+ sizeof(key->icmp.code));
+ } else if (key->basic.n_proto == htons(ETH_P_IPV6) &&
+ key->basic.ip_proto == IPPROTO_ICMPV6) {
+ fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV6_TYPE,
+ &mask->icmp.type,
+ TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,
+ sizeof(key->icmp.type));
+ fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV6_CODE,
+ &mask->icmp.code,
+ TCA_FLOWER_KEY_ICMPV6_CODE_MASK,
+ sizeof(key->icmp.code));
+ } else if (key->basic.n_proto == htons(ETH_P_ARP) ||
+ key->basic.n_proto == htons(ETH_P_RARP)) {
+ fl_set_key_val(tb, &key->arp.sip, TCA_FLOWER_KEY_ARP_SIP,
+ &mask->arp.sip, TCA_FLOWER_KEY_ARP_SIP_MASK,
+ sizeof(key->arp.sip));
+ fl_set_key_val(tb, &key->arp.tip, TCA_FLOWER_KEY_ARP_TIP,
+ &mask->arp.tip, TCA_FLOWER_KEY_ARP_TIP_MASK,
+ sizeof(key->arp.tip));
+ fl_set_key_val(tb, &key->arp.op, TCA_FLOWER_KEY_ARP_OP,
+ &mask->arp.op, TCA_FLOWER_KEY_ARP_OP_MASK,
+ sizeof(key->arp.op));
+ fl_set_key_val(tb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA,
+ mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK,
+ sizeof(key->arp.sha));
+ fl_set_key_val(tb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA,
+ mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK,
+ sizeof(key->arp.tha));
}
if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) {
key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ mask->enc_control.addr_type = ~0;
fl_set_key_val(tb, &key->enc_ipv4.src,
TCA_FLOWER_KEY_ENC_IPV4_SRC,
&mask->enc_ipv4.src,
@@ -489,6 +632,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
if (tb[TCA_FLOWER_KEY_ENC_IPV6_SRC] ||
tb[TCA_FLOWER_KEY_ENC_IPV6_DST]) {
key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ mask->enc_control.addr_type = ~0;
fl_set_key_val(tb, &key->enc_ipv6.src,
TCA_FLOWER_KEY_ENC_IPV6_SRC,
&mask->enc_ipv6.src,
@@ -505,7 +649,18 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
&mask->enc_key_id.keyid, TCA_FLOWER_UNSPEC,
sizeof(key->enc_key_id.keyid));
- return 0;
+ fl_set_key_val(tb, &key->enc_tp.src, TCA_FLOWER_KEY_ENC_UDP_SRC_PORT,
+ &mask->enc_tp.src, TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK,
+ sizeof(key->enc_tp.src));
+
+ fl_set_key_val(tb, &key->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT,
+ &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
+ sizeof(key->enc_tp.dst));
+
+ if (tb[TCA_FLOWER_KEY_FLAGS])
+ ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
+
+ return ret;
}
static bool fl_mask_eq(struct fl_flow_mask *mask1,
@@ -571,7 +726,23 @@ static void fl_init_dissector(struct cls_fl_head *head,
FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
FLOW_DISSECTOR_KEY_PORTS, tp);
FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_ICMP, icmp);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_ARP, arp);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
FLOW_DISSECTOR_KEY_VLAN, vlan);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, enc_ipv4);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, enc_ipv6);
+ if (FL_KEY_IS_MASKED(&mask->key, enc_ipv4) ||
+ FL_KEY_IS_MASKED(&mask->key, enc_ipv6))
+ FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_ENC_CONTROL,
+ enc_control);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp);
skb_flow_dissector_init(&head->dissector, keys, cnt);
}
@@ -666,23 +837,31 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
struct cls_fl_head *head = rtnl_dereference(tp->root);
struct cls_fl_filter *fold = (struct cls_fl_filter *) *arg;
struct cls_fl_filter *fnew;
- struct nlattr *tb[TCA_FLOWER_MAX + 1];
+ struct nlattr **tb;
struct fl_flow_mask mask = {};
int err;
if (!tca[TCA_OPTIONS])
return -EINVAL;
+ tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL);
+ if (!tb)
+ return -ENOBUFS;
+
err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], fl_policy);
if (err < 0)
- return err;
+ goto errout_tb;
- if (fold && handle && fold->handle != handle)
- return -EINVAL;
+ if (fold && handle && fold->handle != handle) {
+ err = -EINVAL;
+ goto errout_tb;
+ }
fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
- if (!fnew)
- return -ENOBUFS;
+ if (!fnew) {
+ err = -ENOBUFS;
+ goto errout_tb;
+ }
err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0);
if (err < 0)
@@ -715,27 +894,35 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
goto errout;
if (!tc_skip_sw(fnew->flags)) {
+ if (!fold && fl_lookup(head, &fnew->mkey)) {
+ err = -EEXIST;
+ goto errout;
+ }
+
err = rhashtable_insert_fast(&head->ht, &fnew->ht_node,
head->ht_params);
if (err)
goto errout;
}
- err = fl_hw_replace_filter(tp,
- &head->dissector,
- &mask.key,
- &fnew->key,
- &fnew->exts,
- (unsigned long)fnew,
- fnew->flags);
- if (err)
- goto errout;
+ if (!tc_skip_hw(fnew->flags)) {
+ err = fl_hw_replace_filter(tp,
+ &head->dissector,
+ &mask.key,
+ fnew);
+ if (err)
+ goto errout;
+ }
+
+ if (!tc_in_hw(fnew->flags))
+ fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
if (fold) {
if (!tc_skip_sw(fold->flags))
rhashtable_remove_fast(&head->ht, &fold->ht_node,
head->ht_params);
- fl_hw_destroy_filter(tp, (unsigned long)fold);
+ if (!tc_skip_hw(fold->flags))
+ fl_hw_destroy_filter(tp, fold);
}
*arg = (unsigned long) fnew;
@@ -748,11 +935,14 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
list_add_tail_rcu(&fnew->list, &head->filters);
}
+ kfree(tb);
return 0;
errout:
tcf_exts_destroy(&fnew->exts);
kfree(fnew);
+errout_tb:
+ kfree(tb);
return err;
}
@@ -764,10 +954,7 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg)
if (!tc_skip_sw(f->flags))
rhashtable_remove_fast(&head->ht, &f->ht_node,
head->ht_params);
- list_del_rcu(&f->list);
- fl_hw_destroy_filter(tp, (unsigned long)f);
- tcf_unbind_filter(tp, &f->res);
- call_rcu(&f->rcu, fl_destroy_filter);
+ __fl_delete(tp, f);
return 0;
}
@@ -830,6 +1017,42 @@ static int fl_dump_key_vlan(struct sk_buff *skb,
return 0;
}
+static void fl_get_key_flag(u32 dissector_key, u32 dissector_mask,
+ u32 *flower_key, u32 *flower_mask,
+ u32 flower_flag_bit, u32 dissector_flag_bit)
+{
+ if (dissector_mask & dissector_flag_bit) {
+ *flower_mask |= flower_flag_bit;
+ if (dissector_key & dissector_flag_bit)
+ *flower_key |= flower_flag_bit;
+ }
+}
+
+static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask)
+{
+ u32 key, mask;
+ __be32 _key, _mask;
+ int err;
+
+ if (!memchr_inv(&flags_mask, 0, sizeof(flags_mask)))
+ return 0;
+
+ key = 0;
+ mask = 0;
+
+ fl_get_key_flag(flags_key, flags_mask, &key, &mask,
+ TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT);
+
+ _key = cpu_to_be32(key);
+ _mask = cpu_to_be32(mask);
+
+ err = nla_put(skb, TCA_FLOWER_KEY_FLAGS, 4, &_key);
+ if (err)
+ return err;
+
+ return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask);
+}
+
static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
@@ -862,7 +1085,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
goto nla_put_failure;
}
- fl_hw_update_stats(tp, f);
+ if (!tc_skip_hw(f->flags))
+ fl_hw_update_stats(tp, f);
if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
@@ -918,6 +1142,57 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
&mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK,
sizeof(key->tp.dst))))
goto nla_put_failure;
+ else if (key->basic.ip_proto == IPPROTO_SCTP &&
+ (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_SCTP_SRC,
+ &mask->tp.src, TCA_FLOWER_KEY_SCTP_SRC_MASK,
+ sizeof(key->tp.src)) ||
+ fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_SCTP_DST,
+ &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK,
+ sizeof(key->tp.dst))))
+ goto nla_put_failure;
+ else if (key->basic.n_proto == htons(ETH_P_IP) &&
+ key->basic.ip_proto == IPPROTO_ICMP &&
+ (fl_dump_key_val(skb, &key->icmp.type,
+ TCA_FLOWER_KEY_ICMPV4_TYPE, &mask->icmp.type,
+ TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,
+ sizeof(key->icmp.type)) ||
+ fl_dump_key_val(skb, &key->icmp.code,
+ TCA_FLOWER_KEY_ICMPV4_CODE, &mask->icmp.code,
+ TCA_FLOWER_KEY_ICMPV4_CODE_MASK,
+ sizeof(key->icmp.code))))
+ goto nla_put_failure;
+ else if (key->basic.n_proto == htons(ETH_P_IPV6) &&
+ key->basic.ip_proto == IPPROTO_ICMPV6 &&
+ (fl_dump_key_val(skb, &key->icmp.type,
+ TCA_FLOWER_KEY_ICMPV6_TYPE, &mask->icmp.type,
+ TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,
+ sizeof(key->icmp.type)) ||
+ fl_dump_key_val(skb, &key->icmp.code,
+ TCA_FLOWER_KEY_ICMPV6_CODE, &mask->icmp.code,
+ TCA_FLOWER_KEY_ICMPV6_CODE_MASK,
+ sizeof(key->icmp.code))))
+ goto nla_put_failure;
+ else if ((key->basic.n_proto == htons(ETH_P_ARP) ||
+ key->basic.n_proto == htons(ETH_P_RARP)) &&
+ (fl_dump_key_val(skb, &key->arp.sip,
+ TCA_FLOWER_KEY_ARP_SIP, &mask->arp.sip,
+ TCA_FLOWER_KEY_ARP_SIP_MASK,
+ sizeof(key->arp.sip)) ||
+ fl_dump_key_val(skb, &key->arp.tip,
+ TCA_FLOWER_KEY_ARP_TIP, &mask->arp.tip,
+ TCA_FLOWER_KEY_ARP_TIP_MASK,
+ sizeof(key->arp.tip)) ||
+ fl_dump_key_val(skb, &key->arp.op,
+ TCA_FLOWER_KEY_ARP_OP, &mask->arp.op,
+ TCA_FLOWER_KEY_ARP_OP_MASK,
+ sizeof(key->arp.op)) ||
+ fl_dump_key_val(skb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA,
+ mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK,
+ sizeof(key->arp.sha)) ||
+ fl_dump_key_val(skb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA,
+ mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK,
+ sizeof(key->arp.tha))))
+ goto nla_put_failure;
if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
(fl_dump_key_val(skb, &key->enc_ipv4.src,
@@ -943,10 +1218,24 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
if (fl_dump_key_val(skb, &key->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID,
&mask->enc_key_id, TCA_FLOWER_UNSPEC,
- sizeof(key->enc_key_id)))
+ sizeof(key->enc_key_id)) ||
+ fl_dump_key_val(skb, &key->enc_tp.src,
+ TCA_FLOWER_KEY_ENC_UDP_SRC_PORT,
+ &mask->enc_tp.src,
+ TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK,
+ sizeof(key->enc_tp.src)) ||
+ fl_dump_key_val(skb, &key->enc_tp.dst,
+ TCA_FLOWER_KEY_ENC_UDP_DST_PORT,
+ &mask->enc_tp.dst,
+ TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
+ sizeof(key->enc_tp.dst)))
goto nla_put_failure;
- nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags);
+ if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
+ goto nla_put_failure;
+
+ if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
+ goto nla_put_failure;
if (tcf_exts_dump(skb, &f->exts))
goto nla_put_failure;
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index f935429bd5ef..224eb2c14346 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -16,16 +16,11 @@
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
-struct cls_mall_filter {
+struct cls_mall_head {
struct tcf_exts exts;
struct tcf_result res;
u32 handle;
- struct rcu_head rcu;
u32 flags;
-};
-
-struct cls_mall_head {
- struct cls_mall_filter *filter;
struct rcu_head rcu;
};
@@ -33,56 +28,52 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct cls_mall_head *head = rcu_dereference_bh(tp->root);
- struct cls_mall_filter *f = head->filter;
- if (tc_skip_sw(f->flags))
+ if (tc_skip_sw(head->flags))
return -1;
- return tcf_exts_exec(skb, &f->exts, res);
+ return tcf_exts_exec(skb, &head->exts, res);
}
static int mall_init(struct tcf_proto *tp)
{
- struct cls_mall_head *head;
-
- head = kzalloc(sizeof(*head), GFP_KERNEL);
- if (!head)
- return -ENOBUFS;
-
- rcu_assign_pointer(tp->root, head);
-
return 0;
}
-static void mall_destroy_filter(struct rcu_head *head)
+static void mall_destroy_rcu(struct rcu_head *rcu)
{
- struct cls_mall_filter *f = container_of(head, struct cls_mall_filter, rcu);
+ struct cls_mall_head *head = container_of(rcu, struct cls_mall_head,
+ rcu);
- tcf_exts_destroy(&f->exts);
-
- kfree(f);
+ tcf_exts_destroy(&head->exts);
+ kfree(head);
}
static int mall_replace_hw_filter(struct tcf_proto *tp,
- struct cls_mall_filter *f,
+ struct cls_mall_head *head,
unsigned long cookie)
{
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_to_netdev offload;
struct tc_cls_matchall_offload mall_offload = {0};
+ int err;
offload.type = TC_SETUP_MATCHALL;
offload.cls_mall = &mall_offload;
offload.cls_mall->command = TC_CLSMATCHALL_REPLACE;
- offload.cls_mall->exts = &f->exts;
+ offload.cls_mall->exts = &head->exts;
offload.cls_mall->cookie = cookie;
- return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
- &offload);
+ err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
+ &offload);
+ if (!err)
+ head->flags |= TCA_CLS_FLAGS_IN_HW;
+
+ return err;
}
static void mall_destroy_hw_filter(struct tcf_proto *tp,
- struct cls_mall_filter *f,
+ struct cls_mall_head *head,
unsigned long cookie)
{
struct net_device *dev = tp->q->dev_queue->dev;
@@ -103,29 +94,20 @@ static bool mall_destroy(struct tcf_proto *tp, bool force)
{
struct cls_mall_head *head = rtnl_dereference(tp->root);
struct net_device *dev = tp->q->dev_queue->dev;
- struct cls_mall_filter *f = head->filter;
- if (!force && f)
- return false;
+ if (!head)
+ return true;
- if (f) {
- if (tc_should_offload(dev, tp, f->flags))
- mall_destroy_hw_filter(tp, f, (unsigned long) f);
+ if (tc_should_offload(dev, tp, head->flags))
+ mall_destroy_hw_filter(tp, head, (unsigned long) head);
- call_rcu(&f->rcu, mall_destroy_filter);
- }
- kfree_rcu(head, rcu);
+ call_rcu(&head->rcu, mall_destroy_rcu);
return true;
}
static unsigned long mall_get(struct tcf_proto *tp, u32 handle)
{
- struct cls_mall_head *head = rtnl_dereference(tp->root);
- struct cls_mall_filter *f = head->filter;
-
- if (f && f->handle == handle)
- return (unsigned long) f;
- return 0;
+ return 0UL;
}
static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
@@ -134,26 +116,31 @@ static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
};
static int mall_set_parms(struct net *net, struct tcf_proto *tp,
- struct cls_mall_filter *f,
+ struct cls_mall_head *head,
unsigned long base, struct nlattr **tb,
struct nlattr *est, bool ovr)
{
struct tcf_exts e;
int err;
- tcf_exts_init(&e, TCA_MATCHALL_ACT, 0);
+ err = tcf_exts_init(&e, TCA_MATCHALL_ACT, 0);
+ if (err)
+ return err;
err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
if (err < 0)
- return err;
+ goto errout;
if (tb[TCA_MATCHALL_CLASSID]) {
- f->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]);
- tcf_bind_filter(tp, &f->res, base);
+ head->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]);
+ tcf_bind_filter(tp, &head->res, base);
}
- tcf_exts_change(tp, &f->exts, &e);
+ tcf_exts_change(tp, &head->exts, &e);
return 0;
+errout:
+ tcf_exts_destroy(&e);
+ return err;
}
static int mall_change(struct net *net, struct sk_buff *in_skb,
@@ -162,21 +149,17 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
unsigned long *arg, bool ovr)
{
struct cls_mall_head *head = rtnl_dereference(tp->root);
- struct cls_mall_filter *fold = (struct cls_mall_filter *) *arg;
struct net_device *dev = tp->q->dev_queue->dev;
- struct cls_mall_filter *f;
struct nlattr *tb[TCA_MATCHALL_MAX + 1];
+ struct cls_mall_head *new;
u32 flags = 0;
int err;
if (!tca[TCA_OPTIONS])
return -EINVAL;
- if (head->filter)
- return -EBUSY;
-
- if (fold)
- return -EINVAL;
+ if (head)
+ return -EEXIST;
err = nla_parse_nested(tb, TCA_MATCHALL_MAX,
tca[TCA_OPTIONS], mall_policy);
@@ -189,64 +172,62 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
return -EINVAL;
}
- f = kzalloc(sizeof(*f), GFP_KERNEL);
- if (!f)
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
return -ENOBUFS;
- tcf_exts_init(&f->exts, TCA_MATCHALL_ACT, 0);
+ err = tcf_exts_init(&new->exts, TCA_MATCHALL_ACT, 0);
+ if (err)
+ goto err_exts_init;
if (!handle)
handle = 1;
- f->handle = handle;
- f->flags = flags;
+ new->handle = handle;
+ new->flags = flags;
- err = mall_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr);
+ err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr);
if (err)
- goto errout;
+ goto err_set_parms;
if (tc_should_offload(dev, tp, flags)) {
- err = mall_replace_hw_filter(tp, f, (unsigned long) f);
+ err = mall_replace_hw_filter(tp, new, (unsigned long) new);
if (err) {
if (tc_skip_sw(flags))
- goto errout;
+ goto err_replace_hw_filter;
else
err = 0;
}
}
- *arg = (unsigned long) f;
- rcu_assign_pointer(head->filter, f);
+ if (!tc_in_hw(new->flags))
+ new->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+ *arg = (unsigned long) head;
+ rcu_assign_pointer(tp->root, new);
+ if (head)
+ call_rcu(&head->rcu, mall_destroy_rcu);
return 0;
-errout:
- kfree(f);
+err_replace_hw_filter:
+err_set_parms:
+ tcf_exts_destroy(&new->exts);
+err_exts_init:
+ kfree(new);
return err;
}
static int mall_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct cls_mall_head *head = rtnl_dereference(tp->root);
- struct cls_mall_filter *f = (struct cls_mall_filter *) arg;
- struct net_device *dev = tp->q->dev_queue->dev;
-
- if (tc_should_offload(dev, tp, f->flags))
- mall_destroy_hw_filter(tp, f, (unsigned long) f);
-
- RCU_INIT_POINTER(head->filter, NULL);
- tcf_unbind_filter(tp, &f->res);
- call_rcu(&f->rcu, mall_destroy_filter);
- return 0;
+ return -EOPNOTSUPP;
}
static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct cls_mall_head *head = rtnl_dereference(tp->root);
- struct cls_mall_filter *f = head->filter;
if (arg->count < arg->skip)
goto skip;
- if (arg->fn(tp, (unsigned long) f, arg) < 0)
+ if (arg->fn(tp, (unsigned long) head, arg) < 0)
arg->stop = 1;
skip:
arg->count++;
@@ -255,28 +236,31 @@ skip:
static int mall_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct cls_mall_filter *f = (struct cls_mall_filter *) fh;
+ struct cls_mall_head *head = (struct cls_mall_head *) fh;
struct nlattr *nest;
- if (!f)
+ if (!head)
return skb->len;
- t->tcm_handle = f->handle;
+ t->tcm_handle = head->handle;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (!nest)
goto nla_put_failure;
- if (f->res.classid &&
- nla_put_u32(skb, TCA_MATCHALL_CLASSID, f->res.classid))
+ if (head->res.classid &&
+ nla_put_u32(skb, TCA_MATCHALL_CLASSID, head->res.classid))
+ goto nla_put_failure;
+
+ if (head->flags && nla_put_u32(skb, TCA_MATCHALL_FLAGS, head->flags))
goto nla_put_failure;
- if (tcf_exts_dump(skb, &f->exts))
+ if (tcf_exts_dump(skb, &head->exts))
goto nla_put_failure;
nla_nest_end(skb, nest);
- if (tcf_exts_dump_stats(skb, &f->exts) < 0)
+ if (tcf_exts_dump_stats(skb, &head->exts) < 0)
goto nla_put_failure;
return skb->len;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index ae83c3aec308..4dbe0c680fe6 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -334,7 +334,6 @@ static int u32_init(struct tcf_proto *tp)
if (root_ht == NULL)
return -ENOBUFS;
- root_ht->divisor = 0;
root_ht->refcnt++;
root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
root_ht->prio = tp->prio;
@@ -524,6 +523,10 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
tp->protocol, &offload);
+
+ if (!err)
+ n->flags |= TCA_CLS_FLAGS_IN_HW;
+
if (tc_skip_sw(flags))
return err;
@@ -896,6 +899,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
return err;
}
+ if (!tc_in_hw(new->flags))
+ new->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
u32_replace_knode(tp, tp_c, new);
tcf_unbind_filter(tp, &n->res);
call_rcu(&n->rcu, u32_delete_key_rcu);
@@ -1015,6 +1021,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
if (err)
goto errhw;
+ if (!tc_in_hw(n->flags))
+ n->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+
ins = &ht->ht[TC_U32_HASH(handle)];
for (pins = rtnl_dereference(*ins); pins;
ins = &pins->next, pins = rtnl_dereference(*ins))
diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c
index c66ca9400ab4..c1b23e3060b8 100644
--- a/net/sched/em_ipset.c
+++ b/net/sched/em_ipset.c
@@ -57,17 +57,20 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
struct xt_action_param acpar;
const struct xt_set_info *set = (const void *) em->data;
struct net_device *dev, *indev = NULL;
+ struct nf_hook_state state = {
+ .net = em->net,
+ };
int ret, network_offset;
switch (tc_skb_protocol(skb)) {
case htons(ETH_P_IP):
- acpar.family = NFPROTO_IPV4;
+ state.pf = NFPROTO_IPV4;
if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
return 0;
acpar.thoff = ip_hdrlen(skb);
break;
case htons(ETH_P_IPV6):
- acpar.family = NFPROTO_IPV6;
+ state.pf = NFPROTO_IPV6;
if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
return 0;
/* doesn't call ipv6_find_hdr() because ipset doesn't use thoff, yet */
@@ -77,9 +80,7 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
return 0;
}
- acpar.hooknum = 0;
-
- opt.family = acpar.family;
+ opt.family = state.pf;
opt.dim = set->dim;
opt.flags = set->flags;
opt.cmdflags = 0;
@@ -95,9 +96,9 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
if (skb->skb_iif)
indev = dev_get_by_index_rcu(em->net, skb->skb_iif);
- acpar.net = em->net;
- acpar.in = indev ? indev : dev;
- acpar.out = dev;
+ state.in = indev ? indev : dev;
+ state.out = dev;
+ acpar.state = &state;
ret = ip_set_test(set->index, skb, &acpar, &opt);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index a309a07ccb35..ae7e4f5b348b 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -63,6 +63,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/sched/loadavg.h>
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/random.h>
@@ -176,11 +177,12 @@ META_COLLECTOR(int_vlan_tag)
{
unsigned short tag;
- tag = skb_vlan_tag_get(skb);
- if (!tag && __vlan_get_tag(skb, &tag))
- *err = -1;
- else
+ if (skb_vlan_tag_present(skb))
+ dst->value = skb_vlan_tag_get(skb);
+ else if (!__vlan_get_tag(skb, &tag))
dst->value = tag;
+ else
+ *err = -1;
}
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 206dc24add3a..bcf49cd22786 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -440,7 +440,6 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab)
EXPORT_SYMBOL(qdisc_put_rtab);
static LIST_HEAD(qdisc_stab_list);
-static DEFINE_SPINLOCK(qdisc_stab_lock);
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
[TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
@@ -474,20 +473,15 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
if (tsize != s->tsize || (!tab && tsize > 0))
return ERR_PTR(-EINVAL);
- spin_lock(&qdisc_stab_lock);
-
list_for_each_entry(stab, &qdisc_stab_list, list) {
if (memcmp(&stab->szopts, s, sizeof(*s)))
continue;
if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
continue;
stab->refcnt++;
- spin_unlock(&qdisc_stab_lock);
return stab;
}
- spin_unlock(&qdisc_stab_lock);
-
stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
if (!stab)
return ERR_PTR(-ENOMEM);
@@ -497,9 +491,7 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
if (tsize > 0)
memcpy(stab->data, tab, tsize * sizeof(u16));
- spin_lock(&qdisc_stab_lock);
list_add_tail(&stab->list, &qdisc_stab_list);
- spin_unlock(&qdisc_stab_lock);
return stab;
}
@@ -514,14 +506,10 @@ void qdisc_put_stab(struct qdisc_size_table *tab)
if (!tab)
return;
- spin_lock(&qdisc_stab_lock);
-
if (--tab->refcnt == 0) {
list_del(&tab->list);
call_rcu_bh(&tab->rcu, stab_kfree_rcu);
}
-
- spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);
@@ -960,6 +948,17 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
sch->handle = handle;
+ /* This exist to keep backward compatible with a userspace
+ * loophole, what allowed userspace to get IFF_NO_QUEUE
+ * facility on older kernels by setting tx_queue_len=0 (prior
+ * to qdisc init), and then forgot to reinit tx_queue_len
+ * before again attaching a qdisc.
+ */
+ if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
+ dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
+ netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
+ }
+
if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
if (qdisc_is_percpu_stats(sch)) {
sch->cpu_bstats =
@@ -1008,6 +1007,8 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
return sch;
}
+ /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
+ ops->destroy(sch);
err_out3:
dev_put(dev);
kfree((char *) sch - sch->padded);
@@ -1384,7 +1385,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
&d, cpu_bstats, &q->bstats) < 0 ||
- gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
goto nla_put_failure;
@@ -1850,6 +1851,7 @@ int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
{
__be16 protocol = tc_skb_protocol(skb);
#ifdef CONFIG_NET_CLS_ACT
+ const int max_reclassify_loop = 4;
const struct tcf_proto *old_tp = tp;
int limit = 0;
@@ -1874,7 +1876,7 @@ reclassify:
return TC_ACT_UNSPEC; /* signal: continue lookup */
#ifdef CONFIG_NET_CLS_ACT
reset:
- if (unlikely(limit++ >= MAX_REC_LOOP)) {
+ if (unlikely(limit++ >= max_reclassify_loop)) {
net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n",
tp->q->ops->id, tp->prio & 0xffff,
ntohs(tp->protocol));
@@ -1888,28 +1890,6 @@ reset:
}
EXPORT_SYMBOL(tc_classify);
-bool tcf_destroy(struct tcf_proto *tp, bool force)
-{
- if (tp->ops->destroy(tp, force)) {
- module_put(tp->ops->owner);
- kfree_rcu(tp, rcu);
- return true;
- }
-
- return false;
-}
-
-void tcf_destroy_chain(struct tcf_proto __rcu **fl)
-{
- struct tcf_proto *tp;
-
- while ((tp = rtnl_dereference(*fl)) != NULL) {
- RCU_INIT_POINTER(*fl, tp->next);
- tcf_destroy(tp, true);
- }
-}
-EXPORT_SYMBOL(tcf_destroy_chain);
-
#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 481e4f12aeb4..2209c2ddacbf 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -15,6 +15,7 @@
#include <linux/file.h> /* for fput */
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
/*
* The ATM queuing discipline provides a framework for invoking classifiers
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index beb554aa8cfb..d6ca18dc04c3 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -19,6 +19,7 @@
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
/* Class-Based Queueing (CBQ) algorithm.
@@ -122,7 +123,7 @@ struct cbq_class {
psched_time_t penalized;
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct tc_cbq_xstats xstats;
struct tcf_proto __rcu *filter_list;
@@ -509,7 +510,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
if (delay) {
ktime_t time;
- time = ktime_set(0, 0);
+ time = 0;
time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
}
@@ -1346,7 +1347,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
return -1;
@@ -1405,7 +1406,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
tcf_destroy_chain(&cl->filter_list);
qdisc_destroy(cl->q);
qdisc_put_rtab(cl->R_tab);
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
if (cl != &q->link)
kfree(cl);
}
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 3b6d5bd69101..3b86a97bc67c 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -16,6 +16,7 @@
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_dissector.h>
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 8af5c59eef84..bb4cbdf75004 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -25,7 +25,7 @@ struct drr_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct list_head alist;
struct Qdisc *qdisc;
@@ -142,7 +142,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
{
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
qdisc_destroy(cl->qdisc);
kfree(cl);
}
@@ -283,7 +283,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
return -1;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 1308bbf460f7..5334e309f17f 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -13,6 +13,7 @@
#include <linux/rtnetlink.h>
#include <linux/bitops.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <asm/byteorder.h>
@@ -200,9 +201,13 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch,
pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p);
if (p->set_tc_index) {
+ int wlen = skb_network_offset(skb);
+
switch (tc_skb_protocol(skb)) {
case htons(ETH_P_IP):
- if (skb_cow_head(skb, sizeof(struct iphdr)))
+ wlen += sizeof(struct iphdr);
+ if (!pskb_may_pull(skb, wlen) ||
+ skb_try_make_writable(skb, wlen))
goto drop;
skb->tc_index = ipv4_get_dsfield(ip_hdr(skb))
@@ -210,7 +215,9 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch,
break;
case htons(ETH_P_IPV6):
- if (skb_cow_head(skb, sizeof(struct ipv6hdr)))
+ wlen += sizeof(struct ipv6hdr);
+ if (!pskb_may_pull(skb, wlen) ||
+ skb_try_make_writable(skb, wlen))
goto drop;
skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb))
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 18e752439f6f..a4f738ac7728 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -136,7 +136,7 @@ static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
struct fq_flow *aux;
parent = *p;
- aux = container_of(parent, struct fq_flow, rate_node);
+ aux = rb_entry(parent, struct fq_flow, rate_node);
if (f->time_next_packet >= aux->time_next_packet)
p = &parent->rb_right;
else
@@ -188,7 +188,7 @@ static void fq_gc(struct fq_sched_data *q,
while (*p) {
parent = *p;
- f = container_of(parent, struct fq_flow, fq_node);
+ f = rb_entry(parent, struct fq_flow, fq_node);
if (f->sk == sk)
break;
@@ -245,7 +245,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
skb_orphan(skb);
}
- root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
+ root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
if (q->flows >= (2U << q->fq_trees_log) &&
q->inactive_flows > q->flows/2)
@@ -256,7 +256,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
while (*p) {
parent = *p;
- f = container_of(parent, struct fq_flow, fq_node);
+ f = rb_entry(parent, struct fq_flow, fq_node);
if (f->sk == sk) {
/* socket might have been reallocated, so check
* if its sk_hash is the same.
@@ -424,7 +424,7 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
q->time_next_delayed_flow = ~0ULL;
while ((p = rb_first(&q->delayed)) != NULL) {
- struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
+ struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);
if (f->time_next_packet > now) {
q->time_next_delayed_flow = f->time_next_packet;
@@ -563,7 +563,7 @@ static void fq_reset(struct Qdisc *sch)
for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
root = &q->fq_root[idx];
while ((p = rb_first(root)) != NULL) {
- f = container_of(p, struct fq_flow, fq_node);
+ f = rb_entry(p, struct fq_flow, fq_node);
rb_erase(p, root);
fq_flow_purge(f);
@@ -593,20 +593,20 @@ static void fq_rehash(struct fq_sched_data *q,
oroot = &old_array[idx];
while ((op = rb_first(oroot)) != NULL) {
rb_erase(op, oroot);
- of = container_of(op, struct fq_flow, fq_node);
+ of = rb_entry(op, struct fq_flow, fq_node);
if (fq_gc_candidate(of)) {
fcnt++;
kmem_cache_free(fq_flow_cachep, of);
continue;
}
- nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];
+ nroot = &new_array[hash_ptr(of->sk, new_log)];
np = &nroot->rb_node;
parent = NULL;
while (*np) {
parent = *np;
- nf = container_of(parent, struct fq_flow, fq_node);
+ nf = rb_entry(parent, struct fq_flow, fq_node);
BUG_ON(nf->sk == of->sk);
if (nf->sk > of->sk)
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index a5ea0e9b6be4..9f3a884d1590 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -23,6 +23,7 @@
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <net/codel.h>
#include <net/codel_impl.h>
#include <net/codel_qdisc.h>
@@ -57,7 +58,6 @@ struct fq_codel_sched_data {
struct fq_codel_flow *flows; /* Flows table [flows_cnt] */
u32 *backlogs; /* backlog table [flows_cnt] */
u32 flows_cnt; /* number of flows */
- u32 perturbation; /* hash perturbation */
u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
u32 drop_batch_size;
u32 memory_limit;
@@ -75,9 +75,7 @@ struct fq_codel_sched_data {
static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q,
struct sk_buff *skb)
{
- u32 hash = skb_get_hash_perturb(skb, q->perturbation);
-
- return reciprocal_scale(hash, q->flows_cnt);
+ return reciprocal_scale(skb_get_hash(skb), q->flows_cnt);
}
static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
@@ -482,7 +480,6 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
q->memory_limit = 32 << 20; /* 32 MBytes */
q->drop_batch_size = 64;
q->quantum = psched_mtu(qdisc_dev(sch));
- q->perturbation = prandom_u32();
INIT_LIST_HEAD(&q->new_flows);
INIT_LIST_HEAD(&q->old_flows);
codel_params_init(&q->cparams);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6cfb6e9038c2..1a2f9e964330 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
void __qdisc_run(struct Qdisc *q)
{
- int quota = weight_p;
+ int quota = dev_tx_weight;
int packets;
while (qdisc_restart(q, &packets)) {
@@ -709,7 +709,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
- gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+ gen_kill_estimator(&qdisc->rate_est);
if (ops->reset)
ops->reset(qdisc);
if (ops->destroy)
@@ -794,7 +794,7 @@ static void attach_default_qdiscs(struct net_device *dev)
}
}
#ifdef CONFIG_NET_SCHED
- if (dev->qdisc)
+ if (dev->qdisc != &noop_qdisc)
qdisc_hash_add(dev->qdisc);
#endif
}
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 000f1d36128e..3ffaa6fb0990 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -114,7 +114,7 @@ struct hfsc_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct tcf_proto __rcu *filter_list; /* filter list */
unsigned int filter_cnt; /* filter count */
unsigned int level; /* class level in hierarchy */
@@ -1091,7 +1091,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
tcf_destroy_chain(&cl->filter_list);
qdisc_destroy(cl->qdisc);
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
if (cl != &q->root)
kfree(cl);
}
@@ -1348,7 +1348,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
xstats.rtwork = cl->cl_cumul;
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0)
return -1;
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index e3d0458af17b..2fae8b5f1b80 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -627,7 +627,9 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN *
sizeof(u32));
if (!q->hhf_arrays[i]) {
- hhf_destroy(sch);
+ /* Note: hhf_destroy() will be called
+ * by our caller.
+ */
return -ENOMEM;
}
}
@@ -638,7 +640,9 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN /
BITS_PER_BYTE);
if (!q->hhf_valid_bits[i]) {
- hhf_destroy(sch);
+ /* Note: hhf_destroy() will be called
+ * by our caller.
+ */
return -ENOMEM;
}
}
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index c798d0de8a9d..4cd5fb134bc9 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -40,6 +40,7 @@
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
/* HTB algorithm.
Author: devik@cdi.cz
@@ -111,7 +112,7 @@ struct htb_class {
unsigned int children;
struct htb_class *parent; /* parent class */
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
/*
* Written often fields
@@ -1145,7 +1146,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
return -1;
@@ -1228,7 +1229,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
WARN_ON(!cl->un.leaf.q);
qdisc_destroy(cl->un.leaf.q);
}
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
tcf_destroy_chain(&cl->filter_list);
kfree(cl);
}
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 8fe6999b642a..3bab5f66c392 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -16,6 +16,7 @@
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
{
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 2bc8d7f8df16..20b7f1646f69 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -52,7 +52,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
/* pre-allocate qdiscs, attachment can't fail */
priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
GFP_KERNEL);
- if (priv->qdiscs == NULL)
+ if (!priv->qdiscs)
return -ENOMEM;
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
@@ -60,18 +60,14 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx),
TC_H_MAKE(TC_H_MAJ(sch->handle),
TC_H_MIN(ntx + 1)));
- if (qdisc == NULL)
- goto err;
+ if (!qdisc)
+ return -ENOMEM;
priv->qdiscs[ntx] = qdisc;
qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
}
sch->flags |= TCQ_F_MQROOT;
return 0;
-
-err:
- mq_destroy(sch);
- return -ENOMEM;
}
static void mq_attach(struct Qdisc *sch)
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index b5c502c78143..922683418e53 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -118,10 +118,8 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
/* pre-allocate qdisc, attachment can't fail */
priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
GFP_KERNEL);
- if (priv->qdiscs == NULL) {
- err = -ENOMEM;
- goto err;
- }
+ if (!priv->qdiscs)
+ return -ENOMEM;
for (i = 0; i < dev->num_tx_queues; i++) {
dev_queue = netdev_get_tx_queue(dev, i);
@@ -129,10 +127,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
get_default_qdisc_ops(dev, i),
TC_H_MAKE(TC_H_MAJ(sch->handle),
TC_H_MIN(i + 1)));
- if (qdisc == NULL) {
- err = -ENOMEM;
- goto err;
- }
+ if (!qdisc)
+ return -ENOMEM;
+
priv->qdiscs[i] = qdisc;
qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
}
@@ -148,7 +145,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
priv->hw_owned = 1;
err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc);
if (err)
- goto err;
+ return err;
} else {
netdev_set_num_tc(dev, qopt->num_tc);
for (i = 0; i < qopt->num_tc; i++)
@@ -162,10 +159,6 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
sch->flags |= TCQ_F_MQROOT;
return 0;
-
-err:
- mqprio_destroy(sch);
- return err;
}
static void mqprio_attach(struct Qdisc *sch)
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 9ffbb025b37e..e7839a0d0eaa 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -25,7 +25,7 @@
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
-
+#include <net/pkt_cls.h>
struct multiq_sched_data {
u16 bands;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 9f7b380cf0a3..c8bb62a1e744 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -152,7 +152,7 @@ struct netem_skb_cb {
static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
{
- return container_of(rb, struct sk_buff, rbnode);
+ return rb_entry(rb, struct sk_buff, rbnode);
}
static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
@@ -626,8 +626,8 @@ deliver:
* If it's at ingress let's pretend the delay is
* from the network (tstamp will be updated).
*/
- if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
- skb->tstamp.tv64 = 0;
+ if (skb->tc_redirected && skb->tc_from_ingress)
+ skb->tstamp = 0;
#endif
if (q->qdisc) {
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 8f575899adfa..d4d7db267b6e 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -20,7 +20,7 @@
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
-
+#include <net/pkt_cls.h>
struct prio_sched_data {
int bands;
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index ca0516e6f743..f9e712ce2d15 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -137,7 +137,7 @@ struct qfq_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct Qdisc *qdisc;
struct list_head alist; /* Link for active-classes list. */
struct qfq_aggregate *agg; /* Parent aggregate. */
@@ -508,7 +508,7 @@ set_change_agg:
new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
if (new_agg == NULL) {
err = -ENOBUFS;
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
goto destroy_class;
}
sch_tree_lock(sch);
@@ -533,7 +533,7 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
struct qfq_sched *q = qdisc_priv(sch);
qfq_rm_from_agg(q, cl);
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
qdisc_destroy(cl->qdisc);
kfree(cl);
}
@@ -667,7 +667,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL,
&cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
return -1;
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 20a350bd1b1d..fe6963d21519 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -25,6 +25,7 @@
#include <linux/jhash.h>
#include <net/ip.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <net/inet_ecn.h>
/*
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 7f195ed4d568..42e8c8615e65 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -23,6 +23,7 @@
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <net/red.h>
@@ -742,9 +743,10 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
if (!q->ht || !q->slots) {
- sfq_destroy(sch);
+ /* Note: sfq_destroy() will be called by our caller */
return -ENOMEM;
}
+
for (i = 0; i < q->divisor; i++)
q->ht[i] = SFQ_EMPTY_SLOT;
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 2cd9b4478b92..9fe6b427afed 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -401,8 +401,8 @@ static int teql_master_close(struct net_device *dev)
return 0;
}
-static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *stats)
+static void teql_master_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *stats)
{
struct teql_master *m = netdev_priv(dev);
@@ -410,7 +410,6 @@ static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
stats->tx_bytes = m->tx_bytes;
stats->tx_errors = m->tx_errors;
stats->tx_dropped = m->tx_dropped;
- return stats;
}
static int teql_master_mtu(struct net_device *dev, int new_mtu)
@@ -418,9 +417,6 @@ static int teql_master_mtu(struct net_device *dev, int new_mtu)
struct teql_master *m = netdev_priv(dev);
struct Qdisc *q;
- if (new_mtu < 68)
- return -EINVAL;
-
q = m->slaves;
if (q) {
do {
@@ -460,6 +456,8 @@ static __init void teql_master_setup(struct net_device *dev)
dev->netdev_ops = &teql_netdev_ops;
dev->type = ARPHRD_VOID;
dev->mtu = 1500;
+ dev->min_mtu = 68;
+ dev->max_mtu = 65535;
dev->tx_queue_len = 100;
dev->flags = IFF_NOARP;
dev->hard_header_len = LL_MAX_HEADER;
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 6c4f7496cec6..70f1b570bab9 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -11,7 +11,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
transport.o chunk.o sm_make_chunk.o ulpevent.o \
inqueue.o outqueue.o ulpqueue.o \
tsnmap.o bind_addr.o socket.o primitive.o \
- output.o input.o debug.o ssnmap.o auth.o \
+ output.o input.o debug.o stream.o auth.o \
offload.o
sctp_probe-y := probe.o
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index f10d3397f917..a9708da28eb5 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -71,9 +71,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
{
struct net *net = sock_net(sk);
struct sctp_sock *sp;
- int i;
sctp_paramhdr_t *p;
- int err;
+ int i;
/* Retrieve the SCTP per socket area. */
sp = sctp_sk((struct sock *)sk);
@@ -207,6 +206,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
* association to the same value as the initial TSN.
*/
asoc->addip_serial = asoc->c.initial_tsn;
+ asoc->strreset_outseq = asoc->c.initial_tsn;
INIT_LIST_HEAD(&asoc->addip_chunk_list);
INIT_LIST_HEAD(&asoc->asconf_ack_list);
@@ -246,6 +246,9 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
if (!sctp_ulpq_init(&asoc->ulpq, asoc))
goto fail_init;
+ if (sctp_stream_new(asoc, gfp))
+ goto fail_init;
+
/* Assume that peer would support both address types unless we are
* told otherwise.
*/
@@ -263,12 +266,13 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
/* AUTH related initializations */
INIT_LIST_HEAD(&asoc->endpoint_shared_keys);
- err = sctp_auth_asoc_copy_shkeys(ep, asoc, gfp);
- if (err)
- goto fail_init;
+ if (sctp_auth_asoc_copy_shkeys(ep, asoc, gfp))
+ goto stream_free;
asoc->active_key_id = ep->active_key_id;
asoc->prsctp_enable = ep->prsctp_enable;
+ asoc->reconf_enable = ep->reconf_enable;
+ asoc->strreset_enable = ep->strreset_enable;
/* Save the hmacs and chunks list into this association */
if (ep->auth_hmacs_list)
@@ -286,6 +290,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
return asoc;
+stream_free:
+ sctp_stream_free(asoc->stream);
fail_init:
sock_put(asoc->base.sk);
sctp_endpoint_put(asoc->ep);
@@ -358,8 +364,11 @@ void sctp_association_free(struct sctp_association *asoc)
sctp_tsnmap_free(&asoc->peer.tsn_map);
- /* Free ssnmap storage. */
- sctp_ssnmap_free(asoc->ssnmap);
+ /* Free stream information. */
+ sctp_stream_free(asoc->stream);
+
+ if (asoc->strreset_chunk)
+ sctp_chunk_free(asoc->strreset_chunk);
/* Clean up the bound address list. */
sctp_bind_addr_free(&asoc->base.bind_addr);
@@ -519,6 +528,12 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc,
if (asoc->peer.last_data_from == peer)
asoc->peer.last_data_from = transport;
+ if (asoc->strreset_chunk &&
+ asoc->strreset_chunk->transport == peer) {
+ asoc->strreset_chunk->transport = transport;
+ sctp_transport_reset_reconf_timer(transport);
+ }
+
/* If we remove the transport an INIT was last sent to, set it to
* NULL. Combined with the update of the retran path above, this
* will cause the next INIT to be sent to the next available
@@ -700,11 +715,15 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
/* Set the peer's active state. */
peer->state = peer_state;
+ /* Add this peer into the transport hashtable */
+ if (sctp_hash_transport(peer)) {
+ sctp_transport_free(peer);
+ return NULL;
+ }
+
/* Attach the remote transport to our asoc. */
list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list);
asoc->peer.transport_count++;
- /* Add this peer into the transport hashtable */
- sctp_hash_transport(peer);
/* If we do not yet have a primary path, set one. */
if (!asoc->peer.primary_path) {
@@ -816,8 +835,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
if (transport->state != SCTP_UNCONFIRMED)
transport->state = SCTP_INACTIVE;
else {
- dst_release(transport->dst);
- transport->dst = NULL;
+ sctp_transport_dst_release(transport);
ulp_notify = false;
}
@@ -1133,7 +1151,7 @@ void sctp_assoc_update(struct sctp_association *asoc,
/* Reinitialize SSN for both local streams
* and peer's streams.
*/
- sctp_ssnmap_clear(asoc->ssnmap);
+ sctp_stream_clear(asoc->stream);
/* Flush the ULP reassembly and ordered queue.
* Any data there will now be stale and will
@@ -1158,10 +1176,9 @@ void sctp_assoc_update(struct sctp_association *asoc,
asoc->ctsn_ack_point = asoc->next_tsn - 1;
asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
- if (!asoc->ssnmap) {
- /* Move the ssnmap. */
- asoc->ssnmap = new->ssnmap;
- new->ssnmap = NULL;
+ if (!asoc->stream) {
+ asoc->stream = new->stream;
+ new->stream = NULL;
}
if (!asoc->assoc_id) {
@@ -1395,7 +1412,7 @@ sctp_assoc_choose_alter_transport(struct sctp_association *asoc,
/* Update the association's pmtu and frag_point by going through all the
* transports. This routine is called when a transport's PMTU has changed.
*/
-void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc)
+void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
{
struct sctp_transport *t;
__u32 pmtu = 0;
@@ -1407,8 +1424,8 @@ void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc)
list_for_each_entry(t, &asoc->peer.transport_addr_list,
transports) {
if (t->pmtu_pending && t->dst) {
- sctp_transport_update_pmtu(sk, t,
- SCTP_TRUNC4(dst_mtu(t->dst)));
+ sctp_transport_update_pmtu(
+ t, SCTP_TRUNC4(dst_mtu(t->dst)));
t->pmtu_pending = 0;
}
if (!pmtu || (t->pathmtu < pmtu))
@@ -1467,7 +1484,7 @@ void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len)
* threshold. The idea is to recover slowly, but up
* to the initial advertised window.
*/
- if (asoc->rwnd_press && asoc->rwnd >= asoc->rwnd_press) {
+ if (asoc->rwnd_press) {
int change = min(asoc->pathmtu, asoc->rwnd_press);
asoc->rwnd += change;
asoc->rwnd_press -= change;
@@ -1535,7 +1552,7 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len)
asoc->rwnd = 0;
}
} else {
- asoc->rwnd_over = len - asoc->rwnd;
+ asoc->rwnd_over += len - asoc->rwnd;
asoc->rwnd = 0;
}
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index 401c60750b20..1ebc184a0e23 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -292,6 +292,8 @@ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
}
af->from_addr_param(&addr, rawaddr, htons(port), 0);
+ if (sctp_bind_addr_state(bp, &addr) != -1)
+ goto next;
retval = sctp_add_bind_addr(bp, &addr, sizeof(addr),
SCTP_ADDR_SRC, gfp);
if (retval) {
@@ -300,6 +302,7 @@ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
break;
}
+next:
len = ntohs(param->length);
addrs_len -= len;
raw_addr_list += len;
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 7a1cdf43e49d..e3621cb4827f 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -52,7 +52,6 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg)
atomic_set(&msg->refcnt, 1);
msg->send_failed = 0;
msg->send_error = 0;
- msg->can_abandon = 0;
msg->can_delay = 1;
msg->expires_at = 0;
INIT_LIST_HEAD(&msg->chunks);
@@ -166,14 +165,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
struct sctp_sndrcvinfo *sinfo,
struct iov_iter *from)
{
- int max, whole, i, offset, over, err;
- int len, first_len;
- int max_data;
+ size_t len, first_len, max_data, remaining;
+ size_t msg_len = iov_iter_count(from);
+ struct list_head *pos, *temp;
struct sctp_chunk *chunk;
struct sctp_datamsg *msg;
- struct list_head *pos, *temp;
- size_t msg_len = iov_iter_count(from);
- __u8 frag;
+ int err;
msg = sctp_datamsg_new(GFP_KERNEL);
if (!msg)
@@ -182,20 +179,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
/* Note: Calculate this outside of the loop, so that all fragments
* have the same expiration.
*/
- if (sinfo->sinfo_timetolive) {
- /* sinfo_timetolive is in milliseconds */
+ if (asoc->peer.prsctp_capable && sinfo->sinfo_timetolive &&
+ (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) ||
+ !SCTP_PR_POLICY(sinfo->sinfo_flags)))
msg->expires_at = jiffies +
- msecs_to_jiffies(sinfo->sinfo_timetolive);
- msg->can_abandon = 1;
-
- pr_debug("%s: msg:%p expires_at:%ld jiffies:%ld\n", __func__,
- msg, msg->expires_at, jiffies);
- }
-
- if (asoc->peer.prsctp_capable &&
- SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags))
- msg->expires_at =
- jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive);
+ msecs_to_jiffies(sinfo->sinfo_timetolive);
/* This is the biggest possible DATA chunk that can fit into
* the packet
@@ -205,7 +193,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
max_data = SCTP_TRUNC4(max_data);
- max = asoc->frag_point;
/* If the the peer requested that we authenticate DATA chunks
* we need to account for bundling of the AUTH chunks along with
* DATA.
@@ -218,12 +205,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
hmac_desc->hmac_len);
}
- /* Now, check if we need to reduce our max */
- if (max > max_data)
- max = max_data;
+ /* Check what's our max considering the above */
+ max_data = min_t(size_t, max_data, asoc->frag_point);
- whole = 0;
- first_len = max;
+ /* Set first_len and then account for possible bundles on first frag */
+ first_len = max_data;
/* Check to see if we have a pending SACK and try to let it be bundled
* with this message. Do this if we don't have any data queued already.
@@ -234,40 +220,38 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) &&
asoc->outqueue.out_qlen == 0 &&
list_empty(&asoc->outqueue.retransmit) &&
- msg_len > max)
- max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
+ msg_len > max_data)
+ first_len -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
/* Encourage Cookie-ECHO bundling. */
if (asoc->state < SCTP_STATE_COOKIE_ECHOED)
- max_data -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
-
- /* Now that we adjusted completely, reset first_len */
- if (first_len > max_data)
- first_len = max_data;
+ first_len -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
/* Account for a different sized first fragment */
if (msg_len >= first_len) {
- msg_len -= first_len;
- whole = 1;
msg->can_delay = 0;
- }
-
- /* How many full sized? How many bytes leftover? */
- whole += msg_len / max;
- over = msg_len % max;
- offset = 0;
-
- if ((whole > 1) || (whole && over))
SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS);
+ } else {
+ /* Which may be the only one... */
+ first_len = msg_len;
+ }
- /* Create chunks for all the full sized DATA chunks. */
- for (i = 0, len = first_len; i < whole; i++) {
- frag = SCTP_DATA_MIDDLE_FRAG;
+ /* Create chunks for all DATA chunks. */
+ for (remaining = msg_len; remaining; remaining -= len) {
+ u8 frag = SCTP_DATA_MIDDLE_FRAG;
- if (0 == i)
+ if (remaining == msg_len) {
+ /* First frag, which may also be the last */
frag |= SCTP_DATA_FIRST_FRAG;
+ len = first_len;
+ } else {
+ /* Middle frags */
+ len = max_data;
+ }
- if ((i == (whole - 1)) && !over) {
+ if (len >= remaining) {
+ /* Last frag, which may also be the first */
+ len = remaining;
frag |= SCTP_DATA_LAST_FRAG;
/* The application requests to set the I-bit of the
@@ -281,7 +265,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag,
0, GFP_KERNEL);
-
if (!chunk) {
err = -ENOMEM;
goto errout;
@@ -292,45 +275,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
goto errout_chunk_free;
/* Put the chunk->skb back into the form expected by send. */
- __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
- - (__u8 *)chunk->skb->data);
-
- sctp_datamsg_assign(msg, chunk);
- list_add_tail(&chunk->frag_list, &msg->chunks);
-
- /* The first chunk, the first chunk was likely short
- * to allow bundling, so reset to full size.
- */
- if (0 == i)
- len = max;
- }
-
- /* .. now the leftover bytes. */
- if (over) {
- if (!whole)
- frag = SCTP_DATA_NOT_FRAG;
- else
- frag = SCTP_DATA_LAST_FRAG;
-
- if ((sinfo->sinfo_flags & SCTP_EOF) ||
- (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY))
- frag |= SCTP_DATA_SACK_IMM;
-
- chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag,
- 0, GFP_KERNEL);
-
- if (!chunk) {
- err = -ENOMEM;
- goto errout;
- }
-
- err = sctp_user_addto_chunk(chunk, over, from);
-
- /* Put the chunk->skb back into the form expected by send. */
- __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
- - (__u8 *)chunk->skb->data);
- if (err < 0)
- goto errout_chunk_free;
+ __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr -
+ chunk->skb->data);
sctp_datamsg_assign(msg, chunk);
list_add_tail(&chunk->frag_list, &msg->chunks);
@@ -348,24 +294,15 @@ errout:
sctp_chunk_free(chunk);
}
sctp_datamsg_put(msg);
+
return ERR_PTR(err);
}
/* Check whether this message has expired. */
int sctp_chunk_abandoned(struct sctp_chunk *chunk)
{
- if (!chunk->asoc->peer.prsctp_capable ||
- !SCTP_PR_POLICY(chunk->sinfo.sinfo_flags)) {
- struct sctp_datamsg *msg = chunk->msg;
-
- if (!msg->can_abandon)
- return 0;
-
- if (time_after(jiffies, msg->expires_at))
- return 1;
-
+ if (!chunk->asoc->peer.prsctp_capable)
return 0;
- }
if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) &&
time_after(jiffies, chunk->msg->expires_at)) {
@@ -378,6 +315,10 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
chunk->sent_count > chunk->sinfo.sinfo_timetolive) {
chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
return 1;
+ } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) &&
+ chunk->msg->expires_at &&
+ time_after(jiffies, chunk->msg->expires_at)) {
+ return 1;
}
/* PRIO policy is processed by sendmsg, not here */
diff --git a/net/sctp/debug.c b/net/sctp/debug.c
index 95d7b15dad21..2e47eb2f05cb 100644
--- a/net/sctp/debug.c
+++ b/net/sctp/debug.c
@@ -159,6 +159,7 @@ static const char *const sctp_timer_tbl[] = {
"TIMEOUT_T4_RTO",
"TIMEOUT_T5_SHUTDOWN_GUARD",
"TIMEOUT_HEARTBEAT",
+ "TIMEOUT_RECONF",
"TIMEOUT_SACK",
"TIMEOUT_AUTOCLOSE",
};
@@ -166,7 +167,9 @@ static const char *const sctp_timer_tbl[] = {
/* Lookup timer debug name. */
const char *sctp_tname(const sctp_subtype_t id)
{
- if (id.timeout <= SCTP_EVENT_TIMEOUT_MAX)
+ BUILD_BUG_ON(SCTP_EVENT_TIMEOUT_MAX + 1 != ARRAY_SIZE(sctp_timer_tbl));
+
+ if (id.timeout < ARRAY_SIZE(sctp_timer_tbl))
return sctp_timer_tbl[id.timeout];
return "unknown_timer";
}
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 1f03065686fe..8c589230794f 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -164,6 +164,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
ep->auth_hmacs_list = auth_hmacs;
ep->auth_chunk_list = auth_chunks;
ep->prsctp_enable = net->sctp.prsctp_enable;
+ ep->reconf_enable = net->sctp.reconf_enable;
return ep;
@@ -331,7 +332,9 @@ struct sctp_association *sctp_endpoint_lookup_assoc(
* on this endpoint.
*/
if (!ep->base.bind_addr.port)
- goto out;
+ return NULL;
+
+ rcu_read_lock();
t = sctp_epaddr_lookup_transport(ep, paddr);
if (!t)
goto out;
@@ -339,6 +342,7 @@ struct sctp_association *sctp_endpoint_lookup_assoc(
*transport = t;
asoc = t->asoc;
out:
+ rcu_read_unlock();
return asoc;
}
diff --git a/net/sctp/input.c b/net/sctp/input.c
index a01a56ec8b8c..0e06a278d2a9 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -401,10 +401,10 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
if (t->param_flags & SPP_PMTUD_ENABLE) {
/* Update transports view of the MTU */
- sctp_transport_update_pmtu(sk, t, pmtu);
+ sctp_transport_update_pmtu(t, pmtu);
/* Update association pmtu. */
- sctp_assoc_sync_pmtu(sk, asoc);
+ sctp_assoc_sync_pmtu(asoc);
}
/* Retransmit with the new pmtu setting.
@@ -790,10 +790,9 @@ hit:
/* rhashtable for transport */
struct sctp_hash_cmp_arg {
- const struct sctp_endpoint *ep;
- const union sctp_addr *laddr;
- const union sctp_addr *paddr;
- const struct net *net;
+ const union sctp_addr *paddr;
+ const struct net *net;
+ u16 lport;
};
static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
@@ -801,7 +800,6 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
{
struct sctp_transport *t = (struct sctp_transport *)ptr;
const struct sctp_hash_cmp_arg *x = arg->key;
- struct sctp_association *asoc;
int err = 1;
if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr))
@@ -809,19 +807,10 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
if (!sctp_transport_hold(t))
return err;
- asoc = t->asoc;
- if (!net_eq(sock_net(asoc->base.sk), x->net))
+ if (!net_eq(sock_net(t->asoc->base.sk), x->net))
+ goto out;
+ if (x->lport != htons(t->asoc->base.bind_addr.port))
goto out;
- if (x->ep) {
- if (x->ep != asoc->ep)
- goto out;
- } else {
- if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port))
- goto out;
- if (!sctp_bind_addr_match(&asoc->base.bind_addr,
- x->laddr, sctp_sk(asoc->base.sk)))
- goto out;
- }
err = 0;
out:
@@ -851,11 +840,9 @@ static inline u32 sctp_hash_key(const void *data, u32 len, u32 seed)
const struct sctp_hash_cmp_arg *x = data;
const union sctp_addr *paddr = x->paddr;
const struct net *net = x->net;
- u16 lport;
+ u16 lport = x->lport;
u32 addr;
- lport = x->ep ? htons(x->ep->base.bind_addr.port) :
- x->laddr->v4.sin_port;
if (paddr->sa.sa_family == AF_INET6)
addr = jhash(&paddr->v6.sin6_addr, 16, seed);
else
@@ -875,29 +862,48 @@ static const struct rhashtable_params sctp_hash_params = {
int sctp_transport_hashtable_init(void)
{
- return rhashtable_init(&sctp_transport_hashtable, &sctp_hash_params);
+ return rhltable_init(&sctp_transport_hashtable, &sctp_hash_params);
}
void sctp_transport_hashtable_destroy(void)
{
- rhashtable_destroy(&sctp_transport_hashtable);
+ rhltable_destroy(&sctp_transport_hashtable);
}
-void sctp_hash_transport(struct sctp_transport *t)
+int sctp_hash_transport(struct sctp_transport *t)
{
+ struct sctp_transport *transport;
+ struct rhlist_head *tmp, *list;
struct sctp_hash_cmp_arg arg;
+ int err;
if (t->asoc->temp)
- return;
+ return 0;
- arg.ep = t->asoc->ep;
- arg.paddr = &t->ipaddr;
arg.net = sock_net(t->asoc->base.sk);
+ arg.paddr = &t->ipaddr;
+ arg.lport = htons(t->asoc->base.bind_addr.port);
+
+ rcu_read_lock();
+ list = rhltable_lookup(&sctp_transport_hashtable, &arg,
+ sctp_hash_params);
+
+ rhl_for_each_entry_rcu(transport, tmp, list, node)
+ if (transport->asoc->ep == t->asoc->ep) {
+ rcu_read_unlock();
+ err = -EEXIST;
+ goto out;
+ }
+ rcu_read_unlock();
+
+ err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
+ &t->node, sctp_hash_params);
+
+out:
+ if (err)
+ pr_err_once("insert transport fail, errno %d\n", err);
-reinsert:
- if (rhashtable_lookup_insert_key(&sctp_transport_hashtable, &arg,
- &t->node, sctp_hash_params) == -EBUSY)
- goto reinsert;
+ return err;
}
void sctp_unhash_transport(struct sctp_transport *t)
@@ -905,39 +911,62 @@ void sctp_unhash_transport(struct sctp_transport *t)
if (t->asoc->temp)
return;
- rhashtable_remove_fast(&sctp_transport_hashtable, &t->node,
- sctp_hash_params);
+ rhltable_remove(&sctp_transport_hashtable, &t->node,
+ sctp_hash_params);
}
+/* return a transport with holding it */
struct sctp_transport *sctp_addrs_lookup_transport(
struct net *net,
const union sctp_addr *laddr,
const union sctp_addr *paddr)
{
+ struct rhlist_head *tmp, *list;
+ struct sctp_transport *t;
struct sctp_hash_cmp_arg arg = {
- .ep = NULL,
- .laddr = laddr,
.paddr = paddr,
.net = net,
+ .lport = laddr->v4.sin_port,
};
- return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg,
- sctp_hash_params);
+ list = rhltable_lookup(&sctp_transport_hashtable, &arg,
+ sctp_hash_params);
+
+ rhl_for_each_entry_rcu(t, tmp, list, node) {
+ if (!sctp_transport_hold(t))
+ continue;
+
+ if (sctp_bind_addr_match(&t->asoc->base.bind_addr,
+ laddr, sctp_sk(t->asoc->base.sk)))
+ return t;
+ sctp_transport_put(t);
+ }
+
+ return NULL;
}
+/* return a transport without holding it, as it's only used under sock lock */
struct sctp_transport *sctp_epaddr_lookup_transport(
const struct sctp_endpoint *ep,
const union sctp_addr *paddr)
{
struct net *net = sock_net(ep->base.sk);
+ struct rhlist_head *tmp, *list;
+ struct sctp_transport *t;
struct sctp_hash_cmp_arg arg = {
- .ep = ep,
.paddr = paddr,
.net = net,
+ .lport = htons(ep->base.bind_addr.port),
};
- return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg,
- sctp_hash_params);
+ list = rhltable_lookup(&sctp_transport_hashtable, &arg,
+ sctp_hash_params);
+
+ rhl_for_each_entry_rcu(t, tmp, list, node)
+ if (ep == t->asoc->ep)
+ return t;
+
+ return NULL;
}
/* Look up an association. */
@@ -951,7 +980,7 @@ static struct sctp_association *__sctp_lookup_association(
struct sctp_association *asoc = NULL;
t = sctp_addrs_lookup_transport(net, local, peer);
- if (!t || !sctp_transport_hold(t))
+ if (!t)
goto out;
asoc = t->asoc;
@@ -1216,13 +1245,26 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net,
struct sctp_association *asoc;
asoc = __sctp_lookup_association(net, laddr, paddr, transportp);
+ if (asoc)
+ goto out;
/* Further lookup for INIT/INIT-ACK packets.
* SCTP Implementors Guide, 2.18 Handling of address
* parameters within the INIT or INIT-ACK.
*/
- if (!asoc)
- asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp);
+ asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp);
+ if (asoc)
+ goto out;
+ if (paddr->sa.sa_family == AF_INET)
+ pr_debug("sctp: asoc not found for src:%pI4:%d dst:%pI4:%d\n",
+ &laddr->v4.sin_addr, ntohs(laddr->v4.sin_port),
+ &paddr->v4.sin_addr, ntohs(paddr->v4.sin_port));
+ else
+ pr_debug("sctp: asoc not found for src:%pI6:%d dst:%pI6:%d\n",
+ &laddr->v6.sin6_addr, ntohs(laddr->v6.sin6_port),
+ &paddr->v6.sin6_addr, ntohs(paddr->v6.sin6_port));
+
+out:
return asoc;
}
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 176af3080a2b..961ee59f696a 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -71,7 +71,7 @@
#include <net/inet_ecn.h>
#include <net/sctp/sctp.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
static inline int sctp_v6_addr_match_len(union sctp_addr *s1,
union sctp_addr *s2);
@@ -222,7 +222,8 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS);
rcu_read_lock();
- res = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), np->tclass);
+ res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
+ np->tclass);
rcu_read_unlock();
return res;
}
@@ -412,22 +413,20 @@ static void sctp_v6_copy_addrlist(struct list_head *addrlist,
static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb,
int is_saddr)
{
- __be16 *port;
- struct sctphdr *sh;
+ /* Always called on head skb, so this is safe */
+ struct sctphdr *sh = sctp_hdr(skb);
+ struct sockaddr_in6 *sa = &addr->v6;
- port = &addr->v6.sin6_port;
addr->v6.sin6_family = AF_INET6;
addr->v6.sin6_flowinfo = 0; /* FIXME */
addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif;
- /* Always called on head skb, so this is safe */
- sh = sctp_hdr(skb);
if (is_saddr) {
- *port = sh->source;
- addr->v6.sin6_addr = ipv6_hdr(skb)->saddr;
+ sa->sin6_port = sh->source;
+ sa->sin6_addr = ipv6_hdr(skb)->saddr;
} else {
- *port = sh->dest;
- addr->v6.sin6_addr = ipv6_hdr(skb)->daddr;
+ sa->sin6_port = sh->dest;
+ sa->sin6_addr = ipv6_hdr(skb)->daddr;
}
}
@@ -641,14 +640,15 @@ static sctp_scope_t sctp_v6_scope(union sctp_addr *addr)
/* Create and initialize a new sk for the socket to be returned by accept(). */
static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
- struct sctp_association *asoc)
+ struct sctp_association *asoc,
+ bool kern)
{
struct sock *newsk;
struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
struct sctp6_sock *newsctp6sk;
struct ipv6_txoptions *opt;
- newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, 0);
+ newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, kern);
if (!newsk)
goto out;
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index 40e7fac96c41..105ac3327b28 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -51,7 +51,6 @@ SCTP_DBG_OBJCNT(bind_addr);
SCTP_DBG_OBJCNT(bind_bucket);
SCTP_DBG_OBJCNT(chunk);
SCTP_DBG_OBJCNT(addr);
-SCTP_DBG_OBJCNT(ssnmap);
SCTP_DBG_OBJCNT(datamsg);
SCTP_DBG_OBJCNT(keys);
@@ -67,7 +66,6 @@ static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = {
SCTP_DBG_OBJCNT_ENTRY(bind_addr),
SCTP_DBG_OBJCNT_ENTRY(bind_bucket),
SCTP_DBG_OBJCNT_ENTRY(addr),
- SCTP_DBG_OBJCNT_ENTRY(ssnmap),
SCTP_DBG_OBJCNT_ENTRY(datamsg),
SCTP_DBG_OBJCNT_ENTRY(keys),
};
diff --git a/net/sctp/offload.c b/net/sctp/offload.c
index 7e869d0cca69..4f5a2b580aa5 100644
--- a/net/sctp/offload.c
+++ b/net/sctp/offload.c
@@ -68,7 +68,7 @@ static struct sk_buff *sctp_gso_segment(struct sk_buff *skb,
goto out;
}
- segs = skb_segment(skb, features | NETIF_F_HW_CSUM);
+ segs = skb_segment(skb, features | NETIF_F_HW_CSUM | NETIF_F_SG);
if (IS_ERR(segs))
goto out;
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 6cb0df859195..1409a875ad8e 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -81,56 +81,64 @@ static void sctp_packet_reset(struct sctp_packet *packet)
/* Config a packet.
* This appears to be a followup set of initializations.
*/
-struct sctp_packet *sctp_packet_config(struct sctp_packet *packet,
- __u32 vtag, int ecn_capable)
+void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
+ int ecn_capable)
{
struct sctp_transport *tp = packet->transport;
struct sctp_association *asoc = tp->asoc;
+ struct sock *sk;
pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag);
-
packet->vtag = vtag;
- if (asoc && tp->dst) {
- struct sock *sk = asoc->base.sk;
-
- rcu_read_lock();
- if (__sk_dst_get(sk) != tp->dst) {
- dst_hold(tp->dst);
- sk_setup_caps(sk, tp->dst);
- }
-
- if (sk_can_gso(sk)) {
- struct net_device *dev = tp->dst->dev;
+ /* do the following jobs only once for a flush schedule */
+ if (!sctp_packet_empty(packet))
+ return;
- packet->max_size = dev->gso_max_size;
- } else {
- packet->max_size = asoc->pathmtu;
- }
- rcu_read_unlock();
+ /* set packet max_size with pathmtu */
+ packet->max_size = tp->pathmtu;
+ if (!asoc)
+ return;
- } else {
- packet->max_size = tp->pathmtu;
+ /* update dst or transport pathmtu if in need */
+ sk = asoc->base.sk;
+ if (!sctp_transport_dst_check(tp)) {
+ sctp_transport_route(tp, NULL, sctp_sk(sk));
+ if (asoc->param_flags & SPP_PMTUD_ENABLE)
+ sctp_assoc_sync_pmtu(asoc);
+ } else if (!sctp_transport_pmtu_check(tp)) {
+ if (asoc->param_flags & SPP_PMTUD_ENABLE)
+ sctp_assoc_sync_pmtu(asoc);
}
- if (ecn_capable && sctp_packet_empty(packet)) {
- struct sctp_chunk *chunk;
+ /* If there a is a prepend chunk stick it on the list before
+ * any other chunks get appended.
+ */
+ if (ecn_capable) {
+ struct sctp_chunk *chunk = sctp_get_ecne_prepend(asoc);
- /* If there a is a prepend chunk stick it on the list before
- * any other chunks get appended.
- */
- chunk = sctp_get_ecne_prepend(asoc);
if (chunk)
sctp_packet_append_chunk(packet, chunk);
}
- return packet;
+ if (!tp->dst)
+ return;
+
+ /* set packet max_size with gso_max_size if gso is enabled*/
+ rcu_read_lock();
+ if (__sk_dst_get(sk) != tp->dst) {
+ dst_hold(tp->dst);
+ sk_setup_caps(sk, tp->dst);
+ }
+ packet->max_size = sk_can_gso(sk) ? tp->dst->dev->gso_max_size
+ : asoc->pathmtu;
+ rcu_read_unlock();
}
/* Initialize the packet structure. */
-struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
- struct sctp_transport *transport,
- __u16 sport, __u16 dport)
+void sctp_packet_init(struct sctp_packet *packet,
+ struct sctp_transport *transport,
+ __u16 sport, __u16 dport)
{
struct sctp_association *asoc = transport->asoc;
size_t overhead;
@@ -151,8 +159,6 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
packet->overhead = overhead;
sctp_packet_reset(packet);
packet->vtag = 0;
-
- return packet;
}
/* Free a packet. */
@@ -181,7 +187,7 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
{
sctp_xmit_t retval;
- pr_debug("%s: packet:%p size:%Zu chunk:%p size:%d\n", __func__,
+ pr_debug("%s: packet:%p size:%zu chunk:%p size:%d\n", __func__,
packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1);
switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) {
@@ -399,186 +405,72 @@ static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk)
atomic_inc(&sk->sk_wmem_alloc);
}
-/* All packets are sent to the network through this function from
- * sctp_outq_tail().
- *
- * The return value is a normal kernel error return value.
- */
-int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
+static int sctp_packet_pack(struct sctp_packet *packet,
+ struct sk_buff *head, int gso, gfp_t gfp)
{
struct sctp_transport *tp = packet->transport;
- struct sctp_association *asoc = tp->asoc;
- struct sctphdr *sh;
- struct sk_buff *nskb = NULL, *head = NULL;
+ struct sctp_auth_chunk *auth = NULL;
struct sctp_chunk *chunk, *tmp;
- struct sock *sk;
- int err = 0;
- int padding; /* How much padding do we need? */
- int pkt_size;
- __u8 has_data = 0;
- int gso = 0;
- int pktcount = 0;
+ int pkt_count = 0, pkt_size;
+ struct sock *sk = head->sk;
+ struct sk_buff *nskb;
int auth_len = 0;
- struct dst_entry *dst;
- unsigned char *auth = NULL; /* pointer to auth in skb data */
-
- pr_debug("%s: packet:%p\n", __func__, packet);
-
- /* Do NOT generate a chunkless packet. */
- if (list_empty(&packet->chunk_list))
- return err;
- /* Set up convenience variables... */
- chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
- sk = chunk->skb->sk;
-
- /* Allocate the head skb, or main one if not in GSO */
- if (packet->size > tp->pathmtu && !packet->ipfragok) {
- if (sk_can_gso(sk)) {
- gso = 1;
- pkt_size = packet->overhead;
- } else {
- /* If this happens, we trash this packet and try
- * to build a new one, hopefully correct this
- * time. Application may notice this error.
- */
- pr_err_once("Trying to GSO but underlying device doesn't support it.");
- goto err;
- }
- } else {
- pkt_size = packet->size;
- }
- head = alloc_skb(pkt_size + MAX_HEADER, gfp);
- if (!head)
- goto err;
if (gso) {
- NAPI_GRO_CB(head)->last = head;
skb_shinfo(head)->gso_type = sk->sk_gso_type;
+ NAPI_GRO_CB(head)->last = head;
+ } else {
+ nskb = head;
+ pkt_size = packet->size;
+ goto merge;
}
- /* Make sure the outbound skb has enough header room reserved. */
- skb_reserve(head, packet->overhead + MAX_HEADER);
-
- /* Set the owning socket so that we know where to get the
- * destination IP address.
- */
- sctp_packet_set_owner_w(head, sk);
-
- if (!sctp_transport_dst_check(tp)) {
- sctp_transport_route(tp, NULL, sctp_sk(sk));
- if (asoc && (asoc->param_flags & SPP_PMTUD_ENABLE)) {
- sctp_assoc_sync_pmtu(sk, asoc);
- }
- }
- dst = dst_clone(tp->dst);
- if (!dst) {
- if (asoc)
- IP_INC_STATS(sock_net(asoc->base.sk),
- IPSTATS_MIB_OUTNOROUTES);
- goto nodst;
- }
- skb_dst_set(head, dst);
-
- /* Build the SCTP header. */
- sh = (struct sctphdr *)skb_push(head, sizeof(struct sctphdr));
- skb_reset_transport_header(head);
- sh->source = htons(packet->source_port);
- sh->dest = htons(packet->destination_port);
-
- /* From 6.8 Adler-32 Checksum Calculation:
- * After the packet is constructed (containing the SCTP common
- * header and one or more control or DATA chunks), the
- * transmitter shall:
- *
- * 1) Fill in the proper Verification Tag in the SCTP common
- * header and initialize the checksum field to 0's.
- */
- sh->vtag = htonl(packet->vtag);
- sh->checksum = 0;
-
- pr_debug("***sctp_transmit_packet***\n");
-
do {
- /* Set up convenience variables... */
- chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
- pktcount++;
+ /* calculate the pkt_size and alloc nskb */
+ pkt_size = packet->overhead;
+ list_for_each_entry_safe(chunk, tmp, &packet->chunk_list,
+ list) {
+ int padded = SCTP_PAD4(chunk->skb->len);
- /* Calculate packet size, so it fits in PMTU. Leave
- * other chunks for the next packets.
- */
- if (gso) {
- pkt_size = packet->overhead;
- list_for_each_entry(chunk, &packet->chunk_list, list) {
- int padded = SCTP_PAD4(chunk->skb->len);
-
- if (chunk == packet->auth)
- auth_len = padded;
- else if (auth_len + padded + packet->overhead >
- tp->pathmtu)
- goto nomem;
- else if (pkt_size + padded > tp->pathmtu)
- break;
- pkt_size += padded;
- }
-
- /* Allocate a new skb. */
- nskb = alloc_skb(pkt_size + MAX_HEADER, gfp);
- if (!nskb)
- goto nomem;
-
- /* Make sure the outbound skb has enough header
- * room reserved.
- */
- skb_reserve(nskb, packet->overhead + MAX_HEADER);
- } else {
- nskb = head;
+ if (chunk == packet->auth)
+ auth_len = padded;
+ else if (auth_len + padded + packet->overhead >
+ tp->pathmtu)
+ return 0;
+ else if (pkt_size + padded > tp->pathmtu)
+ break;
+ pkt_size += padded;
}
+ nskb = alloc_skb(pkt_size + MAX_HEADER, gfp);
+ if (!nskb)
+ return 0;
+ skb_reserve(nskb, packet->overhead + MAX_HEADER);
- /**
- * 3.2 Chunk Field Descriptions
- *
- * The total length of a chunk (including Type, Length and
- * Value fields) MUST be a multiple of 4 bytes. If the length
- * of the chunk is not a multiple of 4 bytes, the sender MUST
- * pad the chunk with all zero bytes and this padding is not
- * included in the chunk length field. The sender should
- * never pad with more than 3 bytes.
- *
- * [This whole comment explains SCTP_PAD4() below.]
- */
-
+merge:
+ /* merge chunks into nskb and append nskb into head list */
pkt_size -= packet->overhead;
list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
+ int padding;
+
list_del_init(&chunk->list);
if (sctp_chunk_is_data(chunk)) {
- /* 6.3.1 C4) When data is in flight and when allowed
- * by rule C5, a new RTT measurement MUST be made each
- * round trip. Furthermore, new RTT measurements
- * SHOULD be made no more than once per round-trip
- * for a given destination transport address.
- */
-
- if (!chunk->resent && !tp->rto_pending) {
+ if (!sctp_chunk_retransmitted(chunk) &&
+ !tp->rto_pending) {
chunk->rtt_in_progress = 1;
tp->rto_pending = 1;
}
-
- has_data = 1;
}
padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len;
if (padding)
memset(skb_put(chunk->skb, padding), 0, padding);
- /* if this is the auth chunk that we are adding,
- * store pointer where it will be added and put
- * the auth into the packet.
- */
if (chunk == packet->auth)
- auth = skb_tail_pointer(nskb);
+ auth = (struct sctp_auth_chunk *)
+ skb_tail_pointer(nskb);
- memcpy(skb_put(nskb, chunk->skb->len),
- chunk->skb->data, chunk->skb->len);
+ memcpy(skb_put(nskb, chunk->skb->len), chunk->skb->data,
+ chunk->skb->len);
pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, rtt_in_progress:%d\n",
chunk,
@@ -588,11 +480,6 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
ntohs(chunk->chunk_hdr->length), chunk->skb->len,
chunk->rtt_in_progress);
- /* If this is a control chunk, this is our last
- * reference. Free data chunks after they've been
- * acknowledged or have failed.
- * Re-queue auth chunks if needed.
- */
pkt_size -= SCTP_PAD4(chunk->skb->len);
if (!sctp_chunk_is_data(chunk) && chunk != packet->auth)
@@ -602,160 +489,163 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
break;
}
- /* SCTP-AUTH, Section 6.2
- * The sender MUST calculate the MAC as described in RFC2104 [2]
- * using the hash function H as described by the MAC Identifier and
- * the shared association key K based on the endpoint pair shared key
- * described by the shared key identifier. The 'data' used for the
- * computation of the AUTH-chunk is given by the AUTH chunk with its
- * HMAC field set to zero (as shown in Figure 6) followed by all
- * chunks that are placed after the AUTH chunk in the SCTP packet.
- */
- if (auth)
- sctp_auth_calculate_hmac(asoc, nskb,
- (struct sctp_auth_chunk *)auth,
- gfp);
-
- if (packet->auth) {
- if (!list_empty(&packet->chunk_list)) {
- /* We will generate more packets, so re-queue
- * auth chunk.
- */
+ if (auth) {
+ sctp_auth_calculate_hmac(tp->asoc, nskb, auth, gfp);
+ /* free auth if no more chunks, or add it back */
+ if (list_empty(&packet->chunk_list))
+ sctp_chunk_free(packet->auth);
+ else
list_add(&packet->auth->list,
&packet->chunk_list);
- } else {
- sctp_chunk_free(packet->auth);
- packet->auth = NULL;
- }
}
- if (!gso)
- break;
-
- if (skb_gro_receive(&head, nskb)) {
- kfree_skb(nskb);
- goto nomem;
+ if (gso) {
+ if (skb_gro_receive(&head, nskb)) {
+ kfree_skb(nskb);
+ return 0;
+ }
+ if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >=
+ sk->sk_gso_max_segs))
+ return 0;
}
- nskb = NULL;
- if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >=
- sk->sk_gso_max_segs))
- goto nomem;
+
+ pkt_count++;
} while (!list_empty(&packet->chunk_list));
- /* 2) Calculate the Adler-32 checksum of the whole packet,
- * including the SCTP common header and all the
- * chunks.
- *
- * Note: Adler-32 is no longer applicable, as has been replaced
- * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>.
- *
- * If it's a GSO packet, it's postponed to sctp_skb_segment.
- */
- if (!sctp_checksum_disable || gso) {
- if (!gso && (!(dst->dev->features & NETIF_F_SCTP_CRC) ||
- dst_xfrm(dst) || packet->ipfragok)) {
- sh->checksum = sctp_compute_cksum(head, 0);
- } else {
- /* no need to seed pseudo checksum for SCTP */
- head->ip_summed = CHECKSUM_PARTIAL;
- head->csum_start = skb_transport_header(head) - head->head;
- head->csum_offset = offsetof(struct sctphdr, checksum);
+ if (gso) {
+ memset(head->cb, 0, max(sizeof(struct inet_skb_parm),
+ sizeof(struct inet6_skb_parm)));
+ skb_shinfo(head)->gso_segs = pkt_count;
+ skb_shinfo(head)->gso_size = GSO_BY_FRAGS;
+ rcu_read_lock();
+ if (skb_dst(head) != tp->dst) {
+ dst_hold(tp->dst);
+ sk_setup_caps(sk, tp->dst);
}
+ rcu_read_unlock();
+ goto chksum;
}
- /* IP layer ECN support
- * From RFC 2481
- * "The ECN-Capable Transport (ECT) bit would be set by the
- * data sender to indicate that the end-points of the
- * transport protocol are ECN-capable."
- *
- * Now setting the ECT bit all the time, as it should not cause
- * any problems protocol-wise even if our peer ignores it.
- *
- * Note: The works for IPv6 layer checks this bit too later
- * in transmission. See IP6_ECN_flow_xmit().
- */
- tp->af_specific->ecn_capable(sk);
+ if (sctp_checksum_disable)
+ return 1;
- /* Set up the IP options. */
- /* BUG: not implemented
- * For v4 this all lives somewhere in sk->sk_opt...
- */
+ if (!(skb_dst(head)->dev->features & NETIF_F_SCTP_CRC) ||
+ dst_xfrm(skb_dst(head)) || packet->ipfragok) {
+ struct sctphdr *sh =
+ (struct sctphdr *)skb_transport_header(head);
- /* Dump that on IP! */
- if (asoc) {
- asoc->stats.opackets += pktcount;
- if (asoc->peer.last_sent_to != tp)
- /* Considering the multiple CPU scenario, this is a
- * "correcter" place for last_sent_to. --xguo
- */
- asoc->peer.last_sent_to = tp;
+ sh->checksum = sctp_compute_cksum(head, 0);
+ } else {
+chksum:
+ head->ip_summed = CHECKSUM_PARTIAL;
+ head->csum_start = skb_transport_header(head) - head->head;
+ head->csum_offset = offsetof(struct sctphdr, checksum);
}
- if (has_data) {
- struct timer_list *timer;
- unsigned long timeout;
+ return pkt_count;
+}
- /* Restart the AUTOCLOSE timer when sending data. */
- if (sctp_state(asoc, ESTABLISHED) &&
- asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) {
- timer = &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE];
- timeout = asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE];
+/* All packets are sent to the network through this function from
+ * sctp_outq_tail().
+ *
+ * The return value is always 0 for now.
+ */
+int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
+{
+ struct sctp_transport *tp = packet->transport;
+ struct sctp_association *asoc = tp->asoc;
+ struct sctp_chunk *chunk, *tmp;
+ int pkt_count, gso = 0;
+ struct dst_entry *dst;
+ struct sk_buff *head;
+ struct sctphdr *sh;
+ struct sock *sk;
+
+ pr_debug("%s: packet:%p\n", __func__, packet);
+ if (list_empty(&packet->chunk_list))
+ return 0;
+ chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
+ sk = chunk->skb->sk;
- if (!mod_timer(timer, jiffies + timeout))
- sctp_association_hold(asoc);
+ /* check gso */
+ if (packet->size > tp->pathmtu && !packet->ipfragok) {
+ if (!sk_can_gso(sk)) {
+ pr_err_once("Trying to GSO but underlying device doesn't support it.");
+ goto out;
}
+ gso = 1;
+ }
+
+ /* alloc head skb */
+ head = alloc_skb((gso ? packet->overhead : packet->size) +
+ MAX_HEADER, gfp);
+ if (!head)
+ goto out;
+ skb_reserve(head, packet->overhead + MAX_HEADER);
+ sctp_packet_set_owner_w(head, sk);
+
+ /* set sctp header */
+ sh = (struct sctphdr *)skb_push(head, sizeof(struct sctphdr));
+ skb_reset_transport_header(head);
+ sh->source = htons(packet->source_port);
+ sh->dest = htons(packet->destination_port);
+ sh->vtag = htonl(packet->vtag);
+ sh->checksum = 0;
+
+ /* drop packet if no dst */
+ dst = dst_clone(tp->dst);
+ if (!dst) {
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ kfree_skb(head);
+ goto out;
}
+ skb_dst_set(head, dst);
+ /* pack up chunks */
+ pkt_count = sctp_packet_pack(packet, head, gso, gfp);
+ if (!pkt_count) {
+ kfree_skb(head);
+ goto out;
+ }
pr_debug("***sctp_transmit_packet*** skb->len:%d\n", head->len);
- if (gso) {
- /* Cleanup our debris for IP stacks */
- memset(head->cb, 0, max(sizeof(struct inet_skb_parm),
- sizeof(struct inet6_skb_parm)));
+ /* start autoclose timer */
+ if (packet->has_data && sctp_state(asoc, ESTABLISHED) &&
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) {
+ struct timer_list *timer =
+ &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE];
+ unsigned long timeout =
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE];
- skb_shinfo(head)->gso_segs = pktcount;
- skb_shinfo(head)->gso_size = GSO_BY_FRAGS;
+ if (!mod_timer(timer, jiffies + timeout))
+ sctp_association_hold(asoc);
+ }
- /* We have to refresh this in case we are xmiting to
- * more than one transport at a time
- */
- rcu_read_lock();
- if (__sk_dst_get(sk) != tp->dst) {
- dst_hold(tp->dst);
- sk_setup_caps(sk, tp->dst);
- }
- rcu_read_unlock();
+ /* sctp xmit */
+ tp->af_specific->ecn_capable(sk);
+ if (asoc) {
+ asoc->stats.opackets += pkt_count;
+ if (asoc->peer.last_sent_to != tp)
+ asoc->peer.last_sent_to = tp;
}
head->ignore_df = packet->ipfragok;
- tp->af_specific->sctp_xmit(head, tp);
- goto out;
-
-nomem:
- if (packet->auth && list_empty(&packet->auth->list))
- sctp_chunk_free(packet->auth);
-
-nodst:
- /* FIXME: Returning the 'err' will effect all the associations
- * associated with a socket, although only one of the paths of the
- * association is unreachable.
- * The real failure of a transport or association can be passed on
- * to the user via notifications. So setting this error may not be
- * required.
+ if (tp->dst_pending_confirm)
+ skb_set_dst_pending_confirm(head, 1);
+ /* neighbour should be confirmed on successful transmission or
+ * positive error
*/
- /* err = -EHOSTUNREACH; */
- kfree_skb(head);
+ if (tp->af_specific->sctp_xmit(head, tp) >= 0 &&
+ tp->dst_pending_confirm)
+ tp->dst_pending_confirm = 0;
-err:
+out:
list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
list_del_init(&chunk->list);
if (!sctp_chunk_is_data(chunk))
sctp_chunk_free(chunk);
}
-
-out:
sctp_packet_reset(packet);
- return err;
+ return 0;
}
/********************************************************************
@@ -818,18 +708,15 @@ static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet,
* unacknowledged.
*/
- if (sctp_sk(asoc->base.sk)->nodelay)
- /* Nagle disabled */
+ if ((sctp_sk(asoc->base.sk)->nodelay || inflight == 0) &&
+ !asoc->force_delay)
+ /* Nothing unacked */
return SCTP_XMIT_OK;
if (!sctp_packet_empty(packet))
/* Append to packet */
return SCTP_XMIT_OK;
- if (inflight == 0)
- /* Nothing unacked */
- return SCTP_XMIT_OK;
-
if (!sctp_state(asoc, ESTABLISHED))
return SCTP_XMIT_OK;
@@ -871,9 +758,6 @@ static void sctp_packet_append_data(struct sctp_packet *packet,
rwnd = 0;
asoc->peer.rwnd = rwnd;
- /* Has been accepted for transmission. */
- if (!asoc->peer.prsctp_capable)
- chunk->msg->can_abandon = 0;
sctp_chunk_assign_tsn(chunk);
sctp_chunk_assign_ssn(chunk);
}
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 582585393d35..8081476ed313 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -382,17 +382,18 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
}
static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
- struct sctp_sndrcvinfo *sinfo,
- struct list_head *queue, int msg_len)
+ struct sctp_sndrcvinfo *sinfo, int msg_len)
{
+ struct sctp_outq *q = &asoc->outqueue;
struct sctp_chunk *chk, *temp;
- list_for_each_entry_safe(chk, temp, queue, list) {
+ list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) {
if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
continue;
list_del_init(&chk->list);
+ q->out_qlen -= chk->skb->len;
asoc->sent_cnt_removable--;
asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
@@ -431,9 +432,7 @@ void sctp_prsctp_prune(struct sctp_association *asoc,
return;
}
- sctp_prsctp_prune_unsent(asoc, sinfo,
- &asoc->outqueue.out_chunk_list,
- msg_len);
+ sctp_prsctp_prune_unsent(asoc, sinfo, msg_len);
}
/* Mark all the eligible packets on a transport for retransmission. */
@@ -507,8 +506,6 @@ void sctp_retransmit_mark(struct sctp_outq *q,
transport->rto_pending = 0;
}
- chunk->resent = 1;
-
/* Move the chunk to the retransmit queue. The chunks
* on the retransmit queue are always kept in order.
*/
@@ -917,22 +914,28 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
case SCTP_CID_ECN_ECNE:
case SCTP_CID_ASCONF:
case SCTP_CID_FWD_TSN:
+ case SCTP_CID_RECONF:
status = sctp_packet_transmit_chunk(packet, chunk,
one_packet, gfp);
if (status != SCTP_XMIT_OK) {
/* put the chunk back */
list_add(&chunk->list, &q->control_chunk_list);
- } else {
- asoc->stats.octrlchunks++;
- /* PR-SCTP C5) If a FORWARD TSN is sent, the
- * sender MUST assure that at least one T3-rtx
- * timer is running.
- */
- if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) {
- sctp_transport_reset_t3_rtx(transport);
- transport->last_time_sent = jiffies;
- }
+ break;
}
+
+ asoc->stats.octrlchunks++;
+ /* PR-SCTP C5) If a FORWARD TSN is sent, the
+ * sender MUST assure that at least one T3-rtx
+ * timer is running.
+ */
+ if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) {
+ sctp_transport_reset_t3_rtx(transport);
+ transport->last_time_sent = jiffies;
+ }
+
+ if (chunk == asoc->strreset_chunk)
+ sctp_transport_reset_reconf_timer(transport);
+
break;
default:
@@ -1018,11 +1021,12 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
/* Finally, transmit new packets. */
while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
+ __u32 sid = ntohs(chunk->subh.data_hdr->stream);
+
/* RFC 2960 6.5 Every DATA chunk MUST carry a valid
* stream identifier.
*/
- if (chunk->sinfo.sinfo_stream >=
- asoc->c.sinit_num_ostreams) {
+ if (chunk->sinfo.sinfo_stream >= asoc->stream->outcnt) {
/* Mark as failed send. */
sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM);
@@ -1040,6 +1044,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
continue;
}
+ if (asoc->stream->out[sid].state == SCTP_STREAM_CLOSED) {
+ sctp_outq_head_data(q, chunk);
+ goto sctp_flush_out;
+ }
+
/* If there is a specified transport, use it.
* Otherwise, we want to use the active path.
*/
@@ -1050,7 +1059,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
(new_transport->state == SCTP_PF)))
new_transport = asoc->peer.active_path;
if (new_transport->state == SCTP_UNCONFIRMED) {
- WARN_ONCE(1, "Atempt to send packet on unconfirmed path.");
+ WARN_ONCE(1, "Attempt to send packet on unconfirmed path.");
sctp_chunk_fail(chunk, 0);
sctp_chunk_free(chunk);
continue;
@@ -1439,7 +1448,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
* instance).
*/
if (!tchunk->tsn_gap_acked &&
- !tchunk->resent &&
+ !sctp_chunk_retransmitted(tchunk) &&
tchunk->rtt_in_progress) {
tchunk->rtt_in_progress = 0;
rtt = jiffies - tchunk->sent_at;
@@ -1643,7 +1652,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
if (forward_progress) {
if (transport->dst)
- dst_confirm(transport->dst);
+ sctp_transport_dst_confirm(transport);
}
}
diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c
index ab8d9f96a177..f0553a022859 100644
--- a/net/sctp/primitive.c
+++ b/net/sctp/primitive.c
@@ -211,3 +211,6 @@ DECLARE_PRIMITIVE(REQUESTHEARTBEAT);
*/
DECLARE_PRIMITIVE(ASCONF);
+
+/* RE-CONFIG 5.1 */
+DECLARE_PRIMITIVE(RECONF);
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 206377fe91ec..a0b29d43627f 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -361,8 +361,8 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
sctp_seq_dump_remote_addrs(seq, assoc);
seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d "
"%8d %8d %8d %8d",
- assoc->hbinterval, assoc->c.sinit_max_instreams,
- assoc->c.sinit_num_ostreams, assoc->max_retrans,
+ assoc->hbinterval, assoc->stream->incnt,
+ assoc->stream->outcnt, assoc->max_retrans,
assoc->init_retries, assoc->shutdown_retries,
assoc->rtx_data_chunks,
atomic_read(&sk->sk_wmem_alloc),
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 7b523e3f551f..989a900383b5 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -199,32 +199,40 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
sctp_scope_t scope, gfp_t gfp, int copy_flags)
{
struct sctp_sockaddr_entry *addr;
+ union sctp_addr laddr;
int error = 0;
rcu_read_lock();
list_for_each_entry_rcu(addr, &net->sctp.local_addr_list, list) {
if (!addr->valid)
continue;
- if (sctp_in_scope(net, &addr->a, scope)) {
- /* Now that the address is in scope, check to see if
- * the address type is really supported by the local
- * sock as well as the remote peer.
- */
- if ((((AF_INET == addr->a.sa.sa_family) &&
- (copy_flags & SCTP_ADDR4_PEERSUPP))) ||
- (((AF_INET6 == addr->a.sa.sa_family) &&
- (copy_flags & SCTP_ADDR6_ALLOWED) &&
- (copy_flags & SCTP_ADDR6_PEERSUPP)))) {
- error = sctp_add_bind_addr(bp, &addr->a,
- sizeof(addr->a),
- SCTP_ADDR_SRC, GFP_ATOMIC);
- if (error)
- goto end_copy;
- }
- }
+ if (!sctp_in_scope(net, &addr->a, scope))
+ continue;
+
+ /* Now that the address is in scope, check to see if
+ * the address type is really supported by the local
+ * sock as well as the remote peer.
+ */
+ if (addr->a.sa.sa_family == AF_INET &&
+ !(copy_flags & SCTP_ADDR4_PEERSUPP))
+ continue;
+ if (addr->a.sa.sa_family == AF_INET6 &&
+ (!(copy_flags & SCTP_ADDR6_ALLOWED) ||
+ !(copy_flags & SCTP_ADDR6_PEERSUPP)))
+ continue;
+
+ laddr = addr->a;
+ /* also works for setting ipv6 address port */
+ laddr.v4.sin_port = htons(bp->port);
+ if (sctp_bind_addr_state(bp, &laddr) != -1)
+ continue;
+
+ error = sctp_add_bind_addr(bp, &addr->a, sizeof(addr->a),
+ SCTP_ADDR_SRC, GFP_ATOMIC);
+ if (error)
+ break;
}
-end_copy:
rcu_read_unlock();
return error;
}
@@ -233,23 +241,19 @@ end_copy:
static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb,
int is_saddr)
{
- void *from;
- __be16 *port;
- struct sctphdr *sh;
+ /* Always called on head skb, so this is safe */
+ struct sctphdr *sh = sctp_hdr(skb);
+ struct sockaddr_in *sa = &addr->v4;
- port = &addr->v4.sin_port;
addr->v4.sin_family = AF_INET;
- /* Always called on head skb, so this is safe */
- sh = sctp_hdr(skb);
if (is_saddr) {
- *port = sh->source;
- from = &ip_hdr(skb)->saddr;
+ sa->sin_port = sh->source;
+ sa->sin_addr.s_addr = ip_hdr(skb)->saddr;
} else {
- *port = sh->dest;
- from = &ip_hdr(skb)->daddr;
+ sa->sin_port = sh->dest;
+ sa->sin_addr.s_addr = ip_hdr(skb)->daddr;
}
- memcpy(&addr->v4.sin_addr.s_addr, from, sizeof(struct in_addr));
}
/* Initialize an sctp_addr from a socket. */
@@ -571,10 +575,11 @@ static int sctp_v4_is_ce(const struct sk_buff *skb)
/* Create and initialize a new sk for the socket returned by accept(). */
static struct sock *sctp_v4_create_accept_sk(struct sock *sk,
- struct sctp_association *asoc)
+ struct sctp_association *asoc,
+ bool kern)
{
struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL,
- sk->sk_prot, 0);
+ sk->sk_prot, kern);
struct inet_sock *newinet;
if (!newsk)
@@ -1258,6 +1263,9 @@ static int __net_init sctp_defaults_init(struct net *net)
/* Enable PR-SCTP by default. */
net->sctp.prsctp_enable = 1;
+ /* Disable RECONF by default. */
+ net->sctp.reconf_enable = 0;
+
/* Disable AUTH by default. */
net->sctp.auth_enable = 0;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 9e9690b7afe1..118faff6a332 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -270,6 +270,11 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
num_ext += 2;
}
+ if (asoc->reconf_enable) {
+ extensions[num_ext] = SCTP_CID_RECONF;
+ num_ext += 1;
+ }
+
if (sp->adaptation_ind)
chunksize += sizeof(aiparam);
@@ -434,6 +439,11 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
num_ext += 2;
}
+ if (asoc->peer.reconf_capable) {
+ extensions[num_ext] = SCTP_CID_RECONF;
+ num_ext += 1;
+ }
+
if (sp->adaptation_ind)
chunksize += sizeof(aiparam);
@@ -1536,7 +1546,7 @@ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk)
/* All fragments will be on the same stream */
sid = ntohs(chunk->subh.data_hdr->stream);
- stream = &chunk->asoc->ssnmap->out;
+ stream = chunk->asoc->stream;
/* Now assign the sequence number to the entire message.
* All fragments must have the same stream sequence number.
@@ -1547,9 +1557,9 @@ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk)
ssn = 0;
} else {
if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG)
- ssn = sctp_ssn_next(stream, sid);
+ ssn = sctp_ssn_next(stream, out, sid);
else
- ssn = sctp_ssn_peek(stream, sid);
+ ssn = sctp_ssn_peek(stream, out, sid);
}
lchunk->subh.data_hdr->ssn = htons(ssn);
@@ -1844,6 +1854,7 @@ no_hmac:
retval->next_tsn = retval->c.initial_tsn;
retval->ctsn_ack_point = retval->next_tsn - 1;
retval->addip_serial = retval->c.initial_tsn;
+ retval->strreset_outseq = retval->c.initial_tsn;
retval->adv_peer_ack_point = retval->ctsn_ack_point;
retval->peer.prsctp_capable = retval->c.prsctp_capable;
retval->peer.adaptation_ind = retval->c.adaptation_ind;
@@ -2011,6 +2022,11 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
for (i = 0; i < num_ext; i++) {
switch (param.ext->chunks[i]) {
+ case SCTP_CID_RECONF:
+ if (asoc->reconf_enable &&
+ !asoc->peer.reconf_capable)
+ asoc->peer.reconf_capable = 1;
+ break;
case SCTP_CID_FWD_TSN:
if (asoc->prsctp_enable && !asoc->peer.prsctp_capable)
asoc->peer.prsctp_capable = 1;
@@ -2387,6 +2403,8 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
asoc->peer.i.initial_tsn =
ntohl(peer_init->init_hdr.initial_tsn);
+ asoc->strreset_inseq = asoc->peer.i.initial_tsn;
+
/* Apply the upper bounds for output streams based on peer's
* number of inbound streams.
*/
@@ -2442,15 +2460,10 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
* association.
*/
if (!asoc->temp) {
- int error;
-
- asoc->ssnmap = sctp_ssnmap_new(asoc->c.sinit_max_instreams,
- asoc->c.sinit_num_ostreams, gfp);
- if (!asoc->ssnmap)
+ if (sctp_stream_init(asoc, gfp))
goto clean_up;
- error = sctp_assoc_set_id(asoc, gfp);
- if (error)
+ if (sctp_assoc_set_id(asoc, gfp))
goto clean_up;
}
@@ -3210,7 +3223,6 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
union sctp_params param;
sctp_addiphdr_t *hdr;
union sctp_addr_param *addr_param;
- sctp_addip_param_t *asconf_param;
struct sctp_chunk *asconf_ack;
__be16 err_code;
int length = 0;
@@ -3230,7 +3242,6 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
* asconf parameter.
*/
length = ntohs(addr_param->p.length);
- asconf_param = (void *)addr_param + length;
chunk_len -= length;
/* create an ASCONF_ACK chunk.
@@ -3317,8 +3328,7 @@ static void sctp_asconf_param_success(struct sctp_association *asoc,
local_bh_enable();
list_for_each_entry(transport, &asoc->peer.transport_addr_list,
transports) {
- dst_release(transport->dst);
- transport->dst = NULL;
+ sctp_transport_dst_release(transport);
}
break;
case SCTP_PARAM_DEL_IP:
@@ -3332,8 +3342,7 @@ static void sctp_asconf_param_success(struct sctp_association *asoc,
local_bh_enable();
list_for_each_entry(transport, &asoc->peer.transport_addr_list,
transports) {
- dst_release(transport->dst);
- transport->dst = NULL;
+ sctp_transport_dst_release(transport);
}
break;
default:
@@ -3526,3 +3535,323 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
return retval;
}
+
+/* RE-CONFIG 3.1 (RE-CONFIG chunk)
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Type = 130 | Chunk Flags | Chunk Length |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * \ \
+ * / Re-configuration Parameter /
+ * \ \
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * \ \
+ * / Re-configuration Parameter (optional) /
+ * \ \
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+static struct sctp_chunk *sctp_make_reconf(
+ const struct sctp_association *asoc,
+ int length)
+{
+ struct sctp_reconf_chunk *reconf;
+ struct sctp_chunk *retval;
+
+ retval = sctp_make_control(asoc, SCTP_CID_RECONF, 0, length,
+ GFP_ATOMIC);
+ if (!retval)
+ return NULL;
+
+ reconf = (struct sctp_reconf_chunk *)retval->chunk_hdr;
+ retval->param_hdr.v = reconf->params;
+
+ return retval;
+}
+
+/* RE-CONFIG 4.1 (STREAM OUT RESET)
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Parameter Type = 13 | Parameter Length = 16 + 2 * N |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Re-configuration Request Sequence Number |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Re-configuration Response Sequence Number |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Sender's Last Assigned TSN |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Stream Number 1 (optional) | Stream Number 2 (optional) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * / ...... /
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Stream Number N-1 (optional) | Stream Number N (optional) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * RE-CONFIG 4.2 (STREAM IN RESET)
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Parameter Type = 14 | Parameter Length = 8 + 2 * N |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Re-configuration Request Sequence Number |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Stream Number 1 (optional) | Stream Number 2 (optional) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * / ...... /
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Stream Number N-1 (optional) | Stream Number N (optional) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_req(
+ const struct sctp_association *asoc,
+ __u16 stream_num, __u16 *stream_list,
+ bool out, bool in)
+{
+ struct sctp_strreset_outreq outreq;
+ __u16 stream_len = stream_num * 2;
+ struct sctp_strreset_inreq inreq;
+ struct sctp_chunk *retval;
+ __u16 outlen, inlen;
+
+ outlen = (sizeof(outreq) + stream_len) * out;
+ inlen = (sizeof(inreq) + stream_len) * in;
+
+ retval = sctp_make_reconf(asoc, outlen + inlen);
+ if (!retval)
+ return NULL;
+
+ if (outlen) {
+ outreq.param_hdr.type = SCTP_PARAM_RESET_OUT_REQUEST;
+ outreq.param_hdr.length = htons(outlen);
+ outreq.request_seq = htonl(asoc->strreset_outseq);
+ outreq.response_seq = htonl(asoc->strreset_inseq - 1);
+ outreq.send_reset_at_tsn = htonl(asoc->next_tsn - 1);
+
+ sctp_addto_chunk(retval, sizeof(outreq), &outreq);
+
+ if (stream_len)
+ sctp_addto_chunk(retval, stream_len, stream_list);
+ }
+
+ if (inlen) {
+ inreq.param_hdr.type = SCTP_PARAM_RESET_IN_REQUEST;
+ inreq.param_hdr.length = htons(inlen);
+ inreq.request_seq = htonl(asoc->strreset_outseq + out);
+
+ sctp_addto_chunk(retval, sizeof(inreq), &inreq);
+
+ if (stream_len)
+ sctp_addto_chunk(retval, stream_len, stream_list);
+ }
+
+ return retval;
+}
+
+/* RE-CONFIG 4.3 (SSN/TSN RESET ALL)
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Parameter Type = 15 | Parameter Length = 8 |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Re-configuration Request Sequence Number |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_tsnreq(
+ const struct sctp_association *asoc)
+{
+ struct sctp_strreset_tsnreq tsnreq;
+ __u16 length = sizeof(tsnreq);
+ struct sctp_chunk *retval;
+
+ retval = sctp_make_reconf(asoc, length);
+ if (!retval)
+ return NULL;
+
+ tsnreq.param_hdr.type = SCTP_PARAM_RESET_TSN_REQUEST;
+ tsnreq.param_hdr.length = htons(length);
+ tsnreq.request_seq = htonl(asoc->strreset_outseq);
+
+ sctp_addto_chunk(retval, sizeof(tsnreq), &tsnreq);
+
+ return retval;
+}
+
+/* RE-CONFIG 4.5/4.6 (ADD STREAM)
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Parameter Type = 17 | Parameter Length = 12 |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Re-configuration Request Sequence Number |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Number of new streams | Reserved |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_addstrm(
+ const struct sctp_association *asoc,
+ __u16 out, __u16 in)
+{
+ struct sctp_strreset_addstrm addstrm;
+ __u16 size = sizeof(addstrm);
+ struct sctp_chunk *retval;
+
+ retval = sctp_make_reconf(asoc, (!!out + !!in) * size);
+ if (!retval)
+ return NULL;
+
+ if (out) {
+ addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_OUT_STREAMS;
+ addstrm.param_hdr.length = htons(size);
+ addstrm.number_of_streams = htons(out);
+ addstrm.request_seq = htonl(asoc->strreset_outseq);
+ addstrm.reserved = 0;
+
+ sctp_addto_chunk(retval, size, &addstrm);
+ }
+
+ if (in) {
+ addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_IN_STREAMS;
+ addstrm.param_hdr.length = htons(size);
+ addstrm.number_of_streams = htons(in);
+ addstrm.request_seq = htonl(asoc->strreset_outseq + !!out);
+ addstrm.reserved = 0;
+
+ sctp_addto_chunk(retval, size, &addstrm);
+ }
+
+ return retval;
+}
+
+/* RE-CONFIG 4.4 (RESP)
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Parameter Type = 16 | Parameter Length |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Re-configuration Response Sequence Number |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Result |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_resp(
+ const struct sctp_association *asoc,
+ __u32 result, __u32 sn)
+{
+ struct sctp_strreset_resp resp;
+ __u16 length = sizeof(resp);
+ struct sctp_chunk *retval;
+
+ retval = sctp_make_reconf(asoc, length);
+ if (!retval)
+ return NULL;
+
+ resp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE;
+ resp.param_hdr.length = htons(length);
+ resp.response_seq = htonl(sn);
+ resp.result = htonl(result);
+
+ sctp_addto_chunk(retval, sizeof(resp), &resp);
+
+ return retval;
+}
+
+/* RE-CONFIG 4.4 OPTIONAL (TSNRESP)
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Parameter Type = 16 | Parameter Length |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Re-configuration Response Sequence Number |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Result |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Sender's Next TSN (optional) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Receiver's Next TSN (optional) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_tsnresp(
+ struct sctp_association *asoc,
+ __u32 result, __u32 sn,
+ __u32 sender_tsn, __u32 receiver_tsn)
+{
+ struct sctp_strreset_resptsn tsnresp;
+ __u16 length = sizeof(tsnresp);
+ struct sctp_chunk *retval;
+
+ retval = sctp_make_reconf(asoc, length);
+ if (!retval)
+ return NULL;
+
+ tsnresp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE;
+ tsnresp.param_hdr.length = htons(length);
+
+ tsnresp.response_seq = htonl(sn);
+ tsnresp.result = htonl(result);
+ tsnresp.senders_next_tsn = htonl(sender_tsn);
+ tsnresp.receivers_next_tsn = htonl(receiver_tsn);
+
+ sctp_addto_chunk(retval, sizeof(tsnresp), &tsnresp);
+
+ return retval;
+}
+
+bool sctp_verify_reconf(const struct sctp_association *asoc,
+ struct sctp_chunk *chunk,
+ struct sctp_paramhdr **errp)
+{
+ struct sctp_reconf_chunk *hdr;
+ union sctp_params param;
+ __u16 last = 0, cnt = 0;
+
+ hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
+ sctp_walk_params(param, hdr, params) {
+ __u16 length = ntohs(param.p->length);
+
+ *errp = param.p;
+ if (cnt++ > 2)
+ return false;
+ switch (param.p->type) {
+ case SCTP_PARAM_RESET_OUT_REQUEST:
+ if (length < sizeof(struct sctp_strreset_outreq) ||
+ (last && last != SCTP_PARAM_RESET_RESPONSE &&
+ last != SCTP_PARAM_RESET_IN_REQUEST))
+ return false;
+ break;
+ case SCTP_PARAM_RESET_IN_REQUEST:
+ if (length < sizeof(struct sctp_strreset_inreq) ||
+ (last && last != SCTP_PARAM_RESET_OUT_REQUEST))
+ return false;
+ break;
+ case SCTP_PARAM_RESET_RESPONSE:
+ if ((length != sizeof(struct sctp_strreset_resp) &&
+ length != sizeof(struct sctp_strreset_resptsn)) ||
+ (last && last != SCTP_PARAM_RESET_RESPONSE &&
+ last != SCTP_PARAM_RESET_OUT_REQUEST))
+ return false;
+ break;
+ case SCTP_PARAM_RESET_TSN_REQUEST:
+ if (length !=
+ sizeof(struct sctp_strreset_tsnreq) || last)
+ return false;
+ break;
+ case SCTP_PARAM_RESET_ADD_IN_STREAMS:
+ if (length != sizeof(struct sctp_strreset_addstrm) ||
+ (last && last != SCTP_PARAM_RESET_ADD_OUT_STREAMS))
+ return false;
+ break;
+ case SCTP_PARAM_RESET_ADD_OUT_STREAMS:
+ if (length != sizeof(struct sctp_strreset_addstrm) ||
+ (last && last != SCTP_PARAM_RESET_ADD_IN_STREAMS))
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ last = param.p->type;
+ }
+
+ return true;
+}
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c345bf153bed..25384fa82ba9 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -436,6 +436,37 @@ out_unlock:
sctp_association_put(asoc);
}
+ /* Handle the timeout of the RE-CONFIG timer. */
+void sctp_generate_reconf_event(unsigned long data)
+{
+ struct sctp_transport *transport = (struct sctp_transport *)data;
+ struct sctp_association *asoc = transport->asoc;
+ struct sock *sk = asoc->base.sk;
+ struct net *net = sock_net(sk);
+ int error = 0;
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ pr_debug("%s: sock is busy\n", __func__);
+
+ /* Try again later. */
+ if (!mod_timer(&transport->reconf_timer, jiffies + (HZ / 20)))
+ sctp_transport_hold(transport);
+ goto out_unlock;
+ }
+
+ error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
+ SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_RECONF),
+ asoc->state, asoc->ep, asoc,
+ transport, GFP_ATOMIC);
+
+ if (error)
+ sk->sk_err = -error;
+
+out_unlock:
+ bh_unlock_sock(sk);
+ sctp_transport_put(transport);
+}
/* Inject a SACK Timeout event into the state machine. */
static void sctp_generate_sack_event(unsigned long data)
@@ -453,6 +484,7 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
sctp_generate_t4_rto_event,
sctp_generate_t5_shutdown_guard_event,
NULL,
+ NULL,
sctp_generate_sack_event,
sctp_generate_autoclose_event,
};
@@ -723,7 +755,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
* forward progress.
*/
if (t->dst)
- dst_confirm(t->dst);
+ sctp_transport_dst_confirm(t);
/* The receiver of the HEARTBEAT ACK should also perform an
* RTT measurement for that destination transport address
@@ -840,6 +872,10 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
if (!sctp_style(sk, UDP))
sk->sk_state_change(sk);
}
+
+ if (sctp_state(asoc, SHUTDOWN_PENDING) &&
+ !sctp_outq_is_empty(&asoc->outqueue))
+ sctp_outq_uncork(&asoc->outqueue, GFP_ATOMIC);
}
/* Helper function to delete an association. */
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8ec20a64a3f8..24c6ccce7539 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -160,23 +160,22 @@ static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net,
/* Small helper function that checks if the chunk length
* is of the appropriate length. The 'required_length' argument
* is set to be the size of a specific chunk we are testing.
- * Return Values: 1 = Valid length
- * 0 = Invalid length
+ * Return Values: true = Valid length
+ * false = Invalid length
*
*/
-static inline int
-sctp_chunk_length_valid(struct sctp_chunk *chunk,
- __u16 required_length)
+static inline bool
+sctp_chunk_length_valid(struct sctp_chunk *chunk, __u16 required_length)
{
__u16 chunk_length = ntohs(chunk->chunk_hdr->length);
/* Previously already marked? */
if (unlikely(chunk->pdiscard))
- return 0;
+ return false;
if (unlikely(chunk_length < required_length))
- return 0;
+ return false;
- return 1;
+ return true;
}
/**********************************************************
@@ -1022,6 +1021,34 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(struct net *net,
return SCTP_DISPOSITION_CONSUME;
}
+/* resend asoc strreset_chunk. */
+sctp_disposition_t sctp_sf_send_reconf(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type, void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_transport *transport = arg;
+
+ if (asoc->overall_error_count >= asoc->max_retrans) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+ SCTP_ERROR(ETIMEDOUT));
+ /* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+ SCTP_PERR(SCTP_ERROR_NO_ERROR));
+ SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+ SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+ return SCTP_DISPOSITION_DELETE_TCB;
+ }
+
+ sctp_chunk_hold(asoc->strreset_chunk);
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+ SCTP_CHUNK(asoc->strreset_chunk));
+ sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE, SCTP_TRANSPORT(transport));
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
/*
* Process an heartbeat request.
*
@@ -3237,36 +3264,34 @@ static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net,
struct sctp_chunk *abort;
packet = sctp_ootb_pkt_new(net, asoc, chunk);
+ if (!packet)
+ return SCTP_DISPOSITION_NOMEM;
- if (packet) {
- /* Make an ABORT. The T bit will be set if the asoc
- * is NULL.
- */
- abort = sctp_make_abort(asoc, chunk, 0);
- if (!abort) {
- sctp_ootb_pkt_free(packet);
- return SCTP_DISPOSITION_NOMEM;
- }
-
- /* Reflect vtag if T-Bit is set */
- if (sctp_test_T_bit(abort))
- packet->vtag = ntohl(chunk->sctp_hdr->vtag);
+ /* Make an ABORT. The T bit will be set if the asoc
+ * is NULL.
+ */
+ abort = sctp_make_abort(asoc, chunk, 0);
+ if (!abort) {
+ sctp_ootb_pkt_free(packet);
+ return SCTP_DISPOSITION_NOMEM;
+ }
- /* Set the skb to the belonging sock for accounting. */
- abort->skb->sk = ep->base.sk;
+ /* Reflect vtag if T-Bit is set */
+ if (sctp_test_T_bit(abort))
+ packet->vtag = ntohl(chunk->sctp_hdr->vtag);
- sctp_packet_append_chunk(packet, abort);
+ /* Set the skb to the belonging sock for accounting. */
+ abort->skb->sk = ep->base.sk;
- sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
- SCTP_PACKET(packet));
+ sctp_packet_append_chunk(packet, abort);
- SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+ sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+ SCTP_PACKET(packet));
- sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
- return SCTP_DISPOSITION_CONSUME;
- }
+ SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
- return SCTP_DISPOSITION_NOMEM;
+ sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ return SCTP_DISPOSITION_CONSUME;
}
/*
@@ -3503,45 +3528,43 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(struct net *net,
struct sctp_chunk *shut;
packet = sctp_ootb_pkt_new(net, asoc, chunk);
+ if (!packet)
+ return SCTP_DISPOSITION_NOMEM;
- if (packet) {
- /* Make an SHUTDOWN_COMPLETE.
- * The T bit will be set if the asoc is NULL.
- */
- shut = sctp_make_shutdown_complete(asoc, chunk);
- if (!shut) {
- sctp_ootb_pkt_free(packet);
- return SCTP_DISPOSITION_NOMEM;
- }
-
- /* Reflect vtag if T-Bit is set */
- if (sctp_test_T_bit(shut))
- packet->vtag = ntohl(chunk->sctp_hdr->vtag);
+ /* Make an SHUTDOWN_COMPLETE.
+ * The T bit will be set if the asoc is NULL.
+ */
+ shut = sctp_make_shutdown_complete(asoc, chunk);
+ if (!shut) {
+ sctp_ootb_pkt_free(packet);
+ return SCTP_DISPOSITION_NOMEM;
+ }
- /* Set the skb to the belonging sock for accounting. */
- shut->skb->sk = ep->base.sk;
+ /* Reflect vtag if T-Bit is set */
+ if (sctp_test_T_bit(shut))
+ packet->vtag = ntohl(chunk->sctp_hdr->vtag);
- sctp_packet_append_chunk(packet, shut);
+ /* Set the skb to the belonging sock for accounting. */
+ shut->skb->sk = ep->base.sk;
- sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
- SCTP_PACKET(packet));
+ sctp_packet_append_chunk(packet, shut);
- SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+ sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+ SCTP_PACKET(packet));
- /* If the chunk length is invalid, we don't want to process
- * the reset of the packet.
- */
- if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
- return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
- /* We need to discard the rest of the packet to prevent
- * potential bomming attacks from additional bundled chunks.
- * This is documented in SCTP Threats ID.
- */
+ /* If the chunk length is invalid, we don't want to process
+ * the reset of the packet.
+ */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
- }
- return SCTP_DISPOSITION_NOMEM;
+ /* We need to discard the rest of the packet to prevent
+ * potential bomming attacks from additional bundled chunks.
+ * This is documented in SCTP Threats ID.
+ */
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
}
/*
@@ -3811,6 +3834,60 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net,
return SCTP_DISPOSITION_DISCARD;
}
+/* RE-CONFIG Section 5.2 Upon reception of an RECONF Chunk. */
+sctp_disposition_t sctp_sf_do_reconf(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type, void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_paramhdr *err_param = NULL;
+ struct sctp_chunk *chunk = arg;
+ struct sctp_reconf_chunk *hdr;
+ union sctp_params param;
+
+ if (!sctp_vtag_verify(chunk, asoc)) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
+ SCTP_NULL());
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ }
+
+ /* Make sure that the RECONF chunk has a valid length. */
+ if (!sctp_chunk_length_valid(chunk, sizeof(*hdr)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ if (!sctp_verify_reconf(asoc, chunk, &err_param))
+ return sctp_sf_violation_paramlen(net, ep, asoc, type, arg,
+ (void *)err_param, commands);
+
+ hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
+ sctp_walk_params(param, hdr, params) {
+ struct sctp_chunk *reply = NULL;
+ struct sctp_ulpevent *ev = NULL;
+
+ if (param.p->type == SCTP_PARAM_RESET_OUT_REQUEST)
+ reply = sctp_process_strreset_outreq(
+ (struct sctp_association *)asoc, param, &ev);
+ else if (param.p->type == SCTP_PARAM_RESET_IN_REQUEST)
+ reply = sctp_process_strreset_inreq(
+ (struct sctp_association *)asoc, param, &ev);
+ /* More handles for other types will be added here, by now it
+ * just ignores other types.
+ */
+
+ if (ev)
+ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+ SCTP_ULPEVENT(ev));
+
+ if (reply)
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+ SCTP_CHUNK(reply));
+ }
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
/*
* PR-SCTP Section 3.6 Receiver Side Implementation of PR-SCTP
*
@@ -3844,6 +3921,9 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(struct net *net,
return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
}
+ if (!asoc->peer.prsctp_capable)
+ return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);
+
/* Make sure that the FORWARD_TSN chunk has valid length. */
if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
@@ -3866,7 +3946,7 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(struct net *net,
/* Silently discard the chunk if stream-id is not valid */
sctp_walk_fwdtsn(skip, chunk) {
- if (ntohs(skip->stream) >= asoc->c.sinit_max_instreams)
+ if (ntohs(skip->stream) >= asoc->stream->incnt)
goto discard_noforce;
}
@@ -3912,6 +3992,9 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn_fast(
return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
}
+ if (!asoc->peer.prsctp_capable)
+ return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);
+
/* Make sure that the FORWARD_TSN chunk has a valid length. */
if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
@@ -3934,7 +4017,7 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn_fast(
/* Silently discard the chunk if stream-id is not valid */
sctp_walk_fwdtsn(skip, chunk) {
- if (ntohs(skip->stream) >= asoc->c.sinit_max_instreams)
+ if (ntohs(skip->stream) >= asoc->stream->incnt)
goto gen_shutdown;
}
@@ -5162,6 +5245,19 @@ sctp_disposition_t sctp_sf_do_prm_asconf(struct net *net,
return SCTP_DISPOSITION_CONSUME;
}
+/* RE-CONFIG Section 5.1 RECONF Chunk Procedures */
+sctp_disposition_t sctp_sf_do_prm_reconf(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg, sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(chunk));
+ return SCTP_DISPOSITION_CONSUME;
+}
+
/*
* Ignore the primitive event
*
@@ -6036,8 +6132,9 @@ static struct sctp_packet *sctp_ootb_pkt_new(struct net *net,
sctp_transport_route(transport, (union sctp_addr *)&chunk->dest,
sctp_sk(net->sctp.ctl_sock));
- packet = sctp_packet_init(&transport->packet, transport, sport, dport);
- packet = sctp_packet_config(packet, vtag, 0);
+ packet = &transport->packet;
+ sctp_packet_init(packet, transport, sport, dport);
+ sctp_packet_config(packet, vtag, 0);
return packet;
@@ -6256,7 +6353,7 @@ static int sctp_eat_data(const struct sctp_association *asoc,
* and discard the DATA chunk.
*/
sid = ntohs(data_hdr->stream);
- if (sid >= asoc->c.sinit_max_instreams) {
+ if (sid >= asoc->stream->incnt) {
/* Mark tsn as received even though we drop it */
sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn));
@@ -6278,9 +6375,8 @@ static int sctp_eat_data(const struct sctp_association *asoc,
* and is invalid.
*/
ssn = ntohs(data_hdr->ssn);
- if (ordered && SSN_lt(ssn, sctp_ssn_peek(&asoc->ssnmap->in, sid))) {
+ if (ordered && SSN_lt(ssn, sctp_ssn_peek(asoc->stream, in, sid)))
return SCTP_IERROR_PROTO_VIOLATION;
- }
/* Send the data up to the user. Note: Schedule the
* SCTP_CMD_CHUNK_ULP cmd before the SCTP_CMD_GEN_SACK, as the SACK
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index a987d54b379c..419b18ebb056 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -482,6 +482,32 @@ static const sctp_sm_table_entry_t prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUN
TYPE_SCTP_FWD_TSN,
}; /*state_fn_t prsctp_chunk_event_table[][] */
+#define TYPE_SCTP_RECONF { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_reconf), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_reconf), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_RECONF */
+
+/* The primary index for this table is the chunk type.
+ * The secondary index for this table is the state.
+ */
+static const sctp_sm_table_entry_t reconf_chunk_event_table[SCTP_NUM_RECONF_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
+ TYPE_SCTP_RECONF,
+}; /*state_fn_t reconf_chunk_event_table[][] */
+
#define TYPE_SCTP_AUTH { \
/* SCTP_STATE_CLOSED */ \
TYPE_SCTP_FUNC(sctp_sf_ootb), \
@@ -643,6 +669,25 @@ chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = {
TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
} /* TYPE_SCTP_PRIMITIVE_ASCONF */
+#define TYPE_SCTP_PRIMITIVE_RECONF { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
+} /* TYPE_SCTP_PRIMITIVE_RECONF */
+
/* The primary index for this table is the primitive type.
* The secondary index for this table is the state.
*/
@@ -653,6 +698,7 @@ static const sctp_sm_table_entry_t primitive_event_table[SCTP_NUM_PRIMITIVE_TYPE
TYPE_SCTP_PRIMITIVE_SEND,
TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT,
TYPE_SCTP_PRIMITIVE_ASCONF,
+ TYPE_SCTP_PRIMITIVE_RECONF,
};
#define TYPE_SCTP_OTHER_NO_PENDING_TSN { \
@@ -888,6 +934,25 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_
TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
}
+#define TYPE_SCTP_EVENT_TIMEOUT_RECONF { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_send_reconf), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+}
+
static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = {
TYPE_SCTP_EVENT_TIMEOUT_NONE,
TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE,
@@ -897,6 +962,7 @@ static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][S
TYPE_SCTP_EVENT_TIMEOUT_T4_RTO,
TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD,
TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT,
+ TYPE_SCTP_EVENT_TIMEOUT_RECONF,
TYPE_SCTP_EVENT_TIMEOUT_SACK,
TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE,
};
@@ -924,6 +990,10 @@ static const sctp_sm_table_entry_t *sctp_chunk_event_lookup(struct net *net,
return &addip_chunk_event_table[1][state];
}
+ if (net->sctp.reconf_enable)
+ if (cid == SCTP_CID_RECONF)
+ return &reconf_chunk_event_table[0][state];
+
if (net->sctp.auth_enable) {
if (cid == SCTP_CID_AUTH)
return &auth_chunk_event_table[0][state];
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f23ad913dc7a..d9d4c92e06b3 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -57,6 +57,7 @@
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/time.h>
+#include <linux/sched/signal.h>
#include <linux/ip.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
@@ -235,8 +236,12 @@ static struct sctp_transport *sctp_addr_id2transport(struct sock *sk,
sctp_assoc_t id)
{
struct sctp_association *addr_asoc = NULL, *id_asoc = NULL;
- struct sctp_transport *transport;
+ struct sctp_af *af = sctp_get_af_specific(addr->ss_family);
union sctp_addr *laddr = (union sctp_addr *)addr;
+ struct sctp_transport *transport;
+
+ if (!af || sctp_verify_addr(sk, laddr, af->sockaddr_len))
+ return NULL;
addr_asoc = sctp_endpoint_lookup_assoc(sctp_sk(sk)->ep,
laddr,
@@ -360,7 +365,7 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
}
}
- if (snum && snum < PROT_SOCK &&
+ if (snum && snum < inet_prot_sock(net) &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
return -EACCES;
@@ -588,7 +593,7 @@ static int sctp_send_asconf_add_ip(struct sock *sk,
list_for_each_entry(trans,
&asoc->peer.transport_addr_list, transports) {
/* Clear the source and route cache */
- dst_release(trans->dst);
+ sctp_transport_dst_release(trans);
trans->cwnd = min(4*asoc->pathmtu, max_t(__u32,
2*asoc->pathmtu, 4380));
trans->ssthresh = asoc->peer.i.a_rwnd;
@@ -839,7 +844,7 @@ skip_mkasconf:
*/
list_for_each_entry(transport, &asoc->peer.transport_addr_list,
transports) {
- dst_release(transport->dst);
+ sctp_transport_dst_release(transport);
sctp_transport_route(transport, NULL,
sctp_sk(asoc->base.sk));
}
@@ -1152,8 +1157,10 @@ static int __sctp_connect(struct sock *sk,
* accept new associations, but it SHOULD NOT
* be permitted to open new associations.
*/
- if (ep->base.bind_addr.port < PROT_SOCK &&
- !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) {
+ if (ep->base.bind_addr.port <
+ inet_prot_sock(net) &&
+ !ns_capable(net->user_ns,
+ CAP_NET_BIND_SERVICE)) {
err = -EACCES;
goto out_free;
}
@@ -1818,7 +1825,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
* but it SHOULD NOT be permitted to open new
* associations.
*/
- if (ep->base.bind_addr.port < PROT_SOCK &&
+ if (ep->base.bind_addr.port < inet_prot_sock(net) &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) {
err = -EACCES;
goto out_unlock;
@@ -1900,7 +1907,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
}
if (asoc->pmtu_pending)
- sctp_assoc_pending_pmtu(sk, asoc);
+ sctp_assoc_pending_pmtu(asoc);
/* If fragmentation is disabled and the message length exceeds the
* association fragmentation point, return EMSGSIZE. The I-D
@@ -1913,7 +1920,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
}
/* Check for invalid stream. */
- if (sinfo->sinfo_stream >= asoc->c.sinit_num_ostreams) {
+ if (sinfo->sinfo_stream >= asoc->stream->outcnt) {
err = -EINVAL;
goto out_free;
}
@@ -1958,6 +1965,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
err = PTR_ERR(datamsg);
goto out_free;
}
+ asoc->force_delay = !!(msg->msg_flags & MSG_MORE);
/* Now send the (possibly) fragmented message. */
list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
@@ -2427,10 +2435,9 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
if ((params->spp_flags & SPP_PMTUD_DISABLE) && params->spp_pathmtu) {
if (trans) {
trans->pathmtu = params->spp_pathmtu;
- sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc);
+ sctp_assoc_sync_pmtu(asoc);
} else if (asoc) {
asoc->pathmtu = params->spp_pathmtu;
- sctp_frag_point(asoc, params->spp_pathmtu);
} else {
sp->pathmtu = params->spp_pathmtu;
}
@@ -2444,7 +2451,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
(trans->param_flags & ~SPP_PMTUD) | pmtud_change;
if (update) {
sctp_transport_pmtu(trans, sctp_opt2sk(sp));
- sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc);
+ sctp_assoc_sync_pmtu(asoc);
}
} else if (asoc) {
asoc->param_flags =
@@ -3751,6 +3758,120 @@ out:
return retval;
}
+static int sctp_setsockopt_enable_strreset(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_association *asoc;
+ int retval = -EINVAL;
+
+ if (optlen != sizeof(params))
+ goto out;
+
+ if (copy_from_user(&params, optval, optlen)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ if (params.assoc_value & (~SCTP_ENABLE_STRRESET_MASK))
+ goto out;
+
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (asoc) {
+ asoc->strreset_enable = params.assoc_value;
+ } else if (!params.assoc_id) {
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ sp->ep->strreset_enable = params.assoc_value;
+ } else {
+ goto out;
+ }
+
+ retval = 0;
+
+out:
+ return retval;
+}
+
+static int sctp_setsockopt_reset_streams(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_reset_streams *params;
+ struct sctp_association *asoc;
+ int retval = -EINVAL;
+
+ if (optlen < sizeof(struct sctp_reset_streams))
+ return -EINVAL;
+
+ params = memdup_user(optval, optlen);
+ if (IS_ERR(params))
+ return PTR_ERR(params);
+
+ asoc = sctp_id2assoc(sk, params->srs_assoc_id);
+ if (!asoc)
+ goto out;
+
+ retval = sctp_send_reset_streams(asoc, params);
+
+out:
+ kfree(params);
+ return retval;
+}
+
+static int sctp_setsockopt_reset_assoc(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_association *asoc;
+ sctp_assoc_t associd;
+ int retval = -EINVAL;
+
+ if (optlen != sizeof(associd))
+ goto out;
+
+ if (copy_from_user(&associd, optval, optlen)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ asoc = sctp_id2assoc(sk, associd);
+ if (!asoc)
+ goto out;
+
+ retval = sctp_send_reset_assoc(asoc);
+
+out:
+ return retval;
+}
+
+static int sctp_setsockopt_add_streams(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_association *asoc;
+ struct sctp_add_streams params;
+ int retval = -EINVAL;
+
+ if (optlen != sizeof(params))
+ goto out;
+
+ if (copy_from_user(&params, optval, optlen)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ asoc = sctp_id2assoc(sk, params.sas_assoc_id);
+ if (!asoc)
+ goto out;
+
+ retval = sctp_send_add_streams(asoc, &params);
+
+out:
+ return retval;
+}
+
/* API 6.2 setsockopt(), getsockopt()
*
* Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3917,6 +4038,18 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
case SCTP_DEFAULT_PRINFO:
retval = sctp_setsockopt_default_prinfo(sk, optval, optlen);
break;
+ case SCTP_ENABLE_STREAM_RESET:
+ retval = sctp_setsockopt_enable_strreset(sk, optval, optlen);
+ break;
+ case SCTP_RESET_STREAMS:
+ retval = sctp_setsockopt_reset_streams(sk, optval, optlen);
+ break;
+ case SCTP_RESET_ASSOC:
+ retval = sctp_setsockopt_reset_assoc(sk, optval, optlen);
+ break;
+ case SCTP_ADD_STREAMS:
+ retval = sctp_setsockopt_add_streams(sk, optval, optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
@@ -3983,7 +4116,7 @@ static int sctp_disconnect(struct sock *sk, int flags)
* descriptor will be returned from accept() to represent the newly
* formed association.
*/
-static struct sock *sctp_accept(struct sock *sk, int flags, int *err)
+static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern)
{
struct sctp_sock *sp;
struct sctp_endpoint *ep;
@@ -4018,7 +4151,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err)
*/
asoc = list_entry(ep->asocs.next, struct sctp_association, asocs);
- newsk = sp->pf->create_accept_sk(sk, asoc);
+ newsk = sp->pf->create_accept_sk(sk, asoc, kern);
if (!newsk) {
error = -ENOMEM;
goto out;
@@ -4328,8 +4461,8 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
info->sctpi_rwnd = asoc->a_rwnd;
info->sctpi_unackdata = asoc->unack_data;
info->sctpi_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
- info->sctpi_instrms = asoc->c.sinit_max_instreams;
- info->sctpi_outstrms = asoc->c.sinit_num_ostreams;
+ info->sctpi_instrms = asoc->stream->incnt;
+ info->sctpi_outstrms = asoc->stream->outcnt;
list_for_each(pos, &asoc->base.inqueue.in_chunk_list)
info->sctpi_inqueue++;
list_for_each(pos, &asoc->outqueue.out_chunk_list)
@@ -4392,10 +4525,7 @@ int sctp_transport_walk_start(struct rhashtable_iter *iter)
{
int err;
- err = rhashtable_walk_init(&sctp_transport_hashtable, iter,
- GFP_KERNEL);
- if (err)
- return err;
+ rhltable_walk_enter(&sctp_transport_hashtable, iter);
err = rhashtable_walk_start(iter);
if (err && err != -EAGAIN) {
@@ -4475,18 +4605,17 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *),
const union sctp_addr *paddr, void *p)
{
struct sctp_transport *transport;
- int err = -ENOENT;
+ int err;
rcu_read_lock();
transport = sctp_addrs_lookup_transport(net, laddr, paddr);
- if (!transport || !sctp_transport_hold(transport))
- goto out;
-
rcu_read_unlock();
+ if (!transport)
+ return -ENOENT;
+
err = cb(transport, p);
sctp_transport_put(transport);
-out:
return err;
}
EXPORT_SYMBOL_GPL(sctp_transport_lookup_process);
@@ -4562,8 +4691,8 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
status.sstat_unackdata = asoc->unack_data;
status.sstat_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
- status.sstat_instrms = asoc->c.sinit_max_instreams;
- status.sstat_outstrms = asoc->c.sinit_num_ostreams;
+ status.sstat_instrms = asoc->stream->incnt;
+ status.sstat_outstrms = asoc->stream->outcnt;
status.sstat_fragmentation_point = asoc->frag_point;
status.sstat_primary.spinfo_assoc_id = sctp_assoc2id(transport->asoc);
memcpy(&status.sstat_primary.spinfo_address, &transport->ipaddr,
@@ -4734,6 +4863,12 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp)
if (!asoc)
return -EINVAL;
+ /* If there is a thread waiting on more sndbuf space for
+ * sending on this asoc, it cannot be peeled.
+ */
+ if (waitqueue_active(&asoc->wait))
+ return -EBUSY;
+
/* An association cannot be branched off from an already peeled-off
* socket, nor is this supported for tcp style sockets.
*/
@@ -6405,6 +6540,47 @@ out:
return retval;
}
+static int sctp_getsockopt_enable_strreset(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_association *asoc;
+ int retval = -EFAULT;
+
+ if (len < sizeof(params)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ len = sizeof(params);
+ if (copy_from_user(&params, optval, len))
+ goto out;
+
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (asoc) {
+ params.assoc_value = asoc->strreset_enable;
+ } else if (!params.assoc_id) {
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ params.assoc_value = sp->ep->strreset_enable;
+ } else {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ if (put_user(len, optlen))
+ goto out;
+
+ if (copy_to_user(optval, &params, len))
+ goto out;
+
+ retval = 0;
+
+out:
+ return retval;
+}
+
static int sctp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -6572,6 +6748,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
retval = sctp_getsockopt_pr_assocstatus(sk, len, optval,
optlen);
break;
+ case SCTP_ENABLE_STREAM_RESET:
+ retval = sctp_getsockopt_enable_strreset(sk, len, optval,
+ optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
@@ -6854,6 +7034,9 @@ int sctp_inet_listen(struct socket *sock, int backlog)
if (sock->state != SS_UNCONNECTED)
goto out;
+ if (!sctp_sstate(sk, LISTENING) && !sctp_sstate(sk, CLOSED))
+ goto out;
+
/* If backlog is zero, disable listening. */
if (!backlog) {
if (sctp_sstate(sk, CLOSED))
@@ -7426,7 +7609,6 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
*/
release_sock(sk);
current_timeo = schedule_timeout(current_timeo);
- BUG_ON(sk != asoc->base.sk);
lock_sock(sk);
*timeo_p = current_timeo;
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
deleted file mode 100644
index b9c8521c1a98..000000000000
--- a/net/sctp/ssnmap.c
+++ /dev/null
@@ -1,125 +0,0 @@
-/* SCTP kernel implementation
- * Copyright (c) 2003 International Business Machines, Corp.
- *
- * This file is part of the SCTP kernel implementation
- *
- * These functions manipulate sctp SSN tracker.
- *
- * This SCTP implementation is free software;
- * you can redistribute it and/or modify it under the terms of
- * the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This SCTP implementation is distributed in the hope that it
- * will be useful, but WITHOUT ANY WARRANTY; without even the implied
- * ************************
- * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- * See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with GNU CC; see the file COPYING. If not, see
- * <http://www.gnu.org/licenses/>.
- *
- * Please send any bug reports or fixes you make to the
- * email address(es):
- * lksctp developers <linux-sctp@vger.kernel.org>
- *
- * Written or modified by:
- * Jon Grimm <jgrimm@us.ibm.com>
- */
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <net/sctp/sctp.h>
-#include <net/sctp/sm.h>
-
-static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in,
- __u16 out);
-
-/* Storage size needed for map includes 2 headers and then the
- * specific needs of in or out streams.
- */
-static inline size_t sctp_ssnmap_size(__u16 in, __u16 out)
-{
- return sizeof(struct sctp_ssnmap) + (in + out) * sizeof(__u16);
-}
-
-
-/* Create a new sctp_ssnmap.
- * Allocate room to store at least 'len' contiguous TSNs.
- */
-struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
- gfp_t gfp)
-{
- struct sctp_ssnmap *retval;
- int size;
-
- size = sctp_ssnmap_size(in, out);
- if (size <= KMALLOC_MAX_SIZE)
- retval = kmalloc(size, gfp);
- else
- retval = (struct sctp_ssnmap *)
- __get_free_pages(gfp, get_order(size));
- if (!retval)
- goto fail;
-
- if (!sctp_ssnmap_init(retval, in, out))
- goto fail_map;
-
- SCTP_DBG_OBJCNT_INC(ssnmap);
-
- return retval;
-
-fail_map:
- if (size <= KMALLOC_MAX_SIZE)
- kfree(retval);
- else
- free_pages((unsigned long)retval, get_order(size));
-fail:
- return NULL;
-}
-
-
-/* Initialize a block of memory as a ssnmap. */
-static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in,
- __u16 out)
-{
- memset(map, 0x00, sctp_ssnmap_size(in, out));
-
- /* Start 'in' stream just after the map header. */
- map->in.ssn = (__u16 *)&map[1];
- map->in.len = in;
-
- /* Start 'out' stream just after 'in'. */
- map->out.ssn = &map->in.ssn[in];
- map->out.len = out;
-
- return map;
-}
-
-/* Clear out the ssnmap streams. */
-void sctp_ssnmap_clear(struct sctp_ssnmap *map)
-{
- size_t size;
-
- size = (map->in.len + map->out.len) * sizeof(__u16);
- memset(map->in.ssn, 0x00, size);
-}
-
-/* Dispose of a ssnmap. */
-void sctp_ssnmap_free(struct sctp_ssnmap *map)
-{
- int size;
-
- if (unlikely(!map))
- return;
-
- size = sctp_ssnmap_size(map->in.len, map->out.len);
- if (size <= KMALLOC_MAX_SIZE)
- kfree(map);
- else
- free_pages((unsigned long)map, get_order(size));
-
- SCTP_DBG_OBJCNT_DEC(ssnmap);
-}
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
new file mode 100644
index 000000000000..bbed997e1c5f
--- /dev/null
+++ b/net/sctp/stream.c
@@ -0,0 +1,506 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001 Intel Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp tsn mapping array.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * Xin Long <lucien.xin@gmail.com>
+ */
+
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+int sctp_stream_new(struct sctp_association *asoc, gfp_t gfp)
+{
+ struct sctp_stream *stream;
+ int i;
+
+ stream = kzalloc(sizeof(*stream), gfp);
+ if (!stream)
+ return -ENOMEM;
+
+ stream->outcnt = asoc->c.sinit_num_ostreams;
+ stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp);
+ if (!stream->out) {
+ kfree(stream);
+ return -ENOMEM;
+ }
+ for (i = 0; i < stream->outcnt; i++)
+ stream->out[i].state = SCTP_STREAM_OPEN;
+
+ asoc->stream = stream;
+
+ return 0;
+}
+
+int sctp_stream_init(struct sctp_association *asoc, gfp_t gfp)
+{
+ struct sctp_stream *stream = asoc->stream;
+ int i;
+
+ /* Initial stream->out size may be very big, so free it and alloc
+ * a new one with new outcnt to save memory.
+ */
+ kfree(stream->out);
+ stream->outcnt = asoc->c.sinit_num_ostreams;
+ stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp);
+ if (!stream->out)
+ goto nomem;
+
+ for (i = 0; i < stream->outcnt; i++)
+ stream->out[i].state = SCTP_STREAM_OPEN;
+
+ stream->incnt = asoc->c.sinit_max_instreams;
+ stream->in = kcalloc(stream->incnt, sizeof(*stream->in), gfp);
+ if (!stream->in) {
+ kfree(stream->out);
+ goto nomem;
+ }
+
+ return 0;
+
+nomem:
+ asoc->stream = NULL;
+ kfree(stream);
+
+ return -ENOMEM;
+}
+
+void sctp_stream_free(struct sctp_stream *stream)
+{
+ if (unlikely(!stream))
+ return;
+
+ kfree(stream->out);
+ kfree(stream->in);
+ kfree(stream);
+}
+
+void sctp_stream_clear(struct sctp_stream *stream)
+{
+ int i;
+
+ for (i = 0; i < stream->outcnt; i++)
+ stream->out[i].ssn = 0;
+
+ for (i = 0; i < stream->incnt; i++)
+ stream->in[i].ssn = 0;
+}
+
+static int sctp_send_reconf(struct sctp_association *asoc,
+ struct sctp_chunk *chunk)
+{
+ struct net *net = sock_net(asoc->base.sk);
+ int retval = 0;
+
+ retval = sctp_primitive_RECONF(net, asoc, chunk);
+ if (retval)
+ sctp_chunk_free(chunk);
+
+ return retval;
+}
+
+int sctp_send_reset_streams(struct sctp_association *asoc,
+ struct sctp_reset_streams *params)
+{
+ struct sctp_stream *stream = asoc->stream;
+ __u16 i, str_nums, *str_list;
+ struct sctp_chunk *chunk;
+ int retval = -EINVAL;
+ bool out, in;
+
+ if (!asoc->peer.reconf_capable ||
+ !(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) {
+ retval = -ENOPROTOOPT;
+ goto out;
+ }
+
+ if (asoc->strreset_outstanding) {
+ retval = -EINPROGRESS;
+ goto out;
+ }
+
+ out = params->srs_flags & SCTP_STREAM_RESET_OUTGOING;
+ in = params->srs_flags & SCTP_STREAM_RESET_INCOMING;
+ if (!out && !in)
+ goto out;
+
+ str_nums = params->srs_number_streams;
+ str_list = params->srs_stream_list;
+ if (out && str_nums)
+ for (i = 0; i < str_nums; i++)
+ if (str_list[i] >= stream->outcnt)
+ goto out;
+
+ if (in && str_nums)
+ for (i = 0; i < str_nums; i++)
+ if (str_list[i] >= stream->incnt)
+ goto out;
+
+ for (i = 0; i < str_nums; i++)
+ str_list[i] = htons(str_list[i]);
+
+ chunk = sctp_make_strreset_req(asoc, str_nums, str_list, out, in);
+
+ for (i = 0; i < str_nums; i++)
+ str_list[i] = ntohs(str_list[i]);
+
+ if (!chunk) {
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ if (out) {
+ if (str_nums)
+ for (i = 0; i < str_nums; i++)
+ stream->out[str_list[i]].state =
+ SCTP_STREAM_CLOSED;
+ else
+ for (i = 0; i < stream->outcnt; i++)
+ stream->out[i].state = SCTP_STREAM_CLOSED;
+ }
+
+ asoc->strreset_chunk = chunk;
+ sctp_chunk_hold(asoc->strreset_chunk);
+
+ retval = sctp_send_reconf(asoc, chunk);
+ if (retval) {
+ sctp_chunk_put(asoc->strreset_chunk);
+ asoc->strreset_chunk = NULL;
+ if (!out)
+ goto out;
+
+ if (str_nums)
+ for (i = 0; i < str_nums; i++)
+ stream->out[str_list[i]].state =
+ SCTP_STREAM_OPEN;
+ else
+ for (i = 0; i < stream->outcnt; i++)
+ stream->out[i].state = SCTP_STREAM_OPEN;
+
+ goto out;
+ }
+
+ asoc->strreset_outstanding = out + in;
+
+out:
+ return retval;
+}
+
+int sctp_send_reset_assoc(struct sctp_association *asoc)
+{
+ struct sctp_chunk *chunk = NULL;
+ int retval;
+ __u16 i;
+
+ if (!asoc->peer.reconf_capable ||
+ !(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ))
+ return -ENOPROTOOPT;
+
+ if (asoc->strreset_outstanding)
+ return -EINPROGRESS;
+
+ chunk = sctp_make_strreset_tsnreq(asoc);
+ if (!chunk)
+ return -ENOMEM;
+
+ /* Block further xmit of data until this request is completed */
+ for (i = 0; i < asoc->stream->outcnt; i++)
+ asoc->stream->out[i].state = SCTP_STREAM_CLOSED;
+
+ asoc->strreset_chunk = chunk;
+ sctp_chunk_hold(asoc->strreset_chunk);
+
+ retval = sctp_send_reconf(asoc, chunk);
+ if (retval) {
+ sctp_chunk_put(asoc->strreset_chunk);
+ asoc->strreset_chunk = NULL;
+
+ for (i = 0; i < asoc->stream->outcnt; i++)
+ asoc->stream->out[i].state = SCTP_STREAM_OPEN;
+
+ return retval;
+ }
+
+ asoc->strreset_outstanding = 1;
+
+ return 0;
+}
+
+int sctp_send_add_streams(struct sctp_association *asoc,
+ struct sctp_add_streams *params)
+{
+ struct sctp_stream *stream = asoc->stream;
+ struct sctp_chunk *chunk = NULL;
+ int retval = -ENOMEM;
+ __u32 outcnt, incnt;
+ __u16 out, in;
+
+ if (!asoc->peer.reconf_capable ||
+ !(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) {
+ retval = -ENOPROTOOPT;
+ goto out;
+ }
+
+ if (asoc->strreset_outstanding) {
+ retval = -EINPROGRESS;
+ goto out;
+ }
+
+ out = params->sas_outstrms;
+ in = params->sas_instrms;
+ outcnt = stream->outcnt + out;
+ incnt = stream->incnt + in;
+ if (outcnt > SCTP_MAX_STREAM || incnt > SCTP_MAX_STREAM ||
+ (!out && !in)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ if (out) {
+ struct sctp_stream_out *streamout;
+
+ streamout = krealloc(stream->out, outcnt * sizeof(*streamout),
+ GFP_KERNEL);
+ if (!streamout)
+ goto out;
+
+ memset(streamout + stream->outcnt, 0, out * sizeof(*streamout));
+ stream->out = streamout;
+ }
+
+ if (in) {
+ struct sctp_stream_in *streamin;
+
+ streamin = krealloc(stream->in, incnt * sizeof(*streamin),
+ GFP_KERNEL);
+ if (!streamin)
+ goto out;
+
+ memset(streamin + stream->incnt, 0, in * sizeof(*streamin));
+ stream->in = streamin;
+ }
+
+ chunk = sctp_make_strreset_addstrm(asoc, out, in);
+ if (!chunk)
+ goto out;
+
+ asoc->strreset_chunk = chunk;
+ sctp_chunk_hold(asoc->strreset_chunk);
+
+ retval = sctp_send_reconf(asoc, chunk);
+ if (retval) {
+ sctp_chunk_put(asoc->strreset_chunk);
+ asoc->strreset_chunk = NULL;
+ goto out;
+ }
+
+ stream->incnt = incnt;
+ stream->outcnt = outcnt;
+
+ asoc->strreset_outstanding = !!out + !!in;
+
+out:
+ return retval;
+}
+
+static sctp_paramhdr_t *sctp_chunk_lookup_strreset_param(
+ struct sctp_association *asoc, __u32 resp_seq)
+{
+ struct sctp_chunk *chunk = asoc->strreset_chunk;
+ struct sctp_reconf_chunk *hdr;
+ union sctp_params param;
+
+ if (ntohl(resp_seq) != asoc->strreset_outseq || !chunk)
+ return NULL;
+
+ hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
+ sctp_walk_params(param, hdr, params) {
+ /* sctp_strreset_tsnreq is actually the basic structure
+ * of all stream reconf params, so it's safe to use it
+ * to access request_seq.
+ */
+ struct sctp_strreset_tsnreq *req = param.v;
+
+ if (req->request_seq == resp_seq)
+ return param.v;
+ }
+
+ return NULL;
+}
+
+struct sctp_chunk *sctp_process_strreset_outreq(
+ struct sctp_association *asoc,
+ union sctp_params param,
+ struct sctp_ulpevent **evp)
+{
+ struct sctp_strreset_outreq *outreq = param.v;
+ struct sctp_stream *stream = asoc->stream;
+ __u16 i, nums, flags = 0, *str_p = NULL;
+ __u32 result = SCTP_STRRESET_DENIED;
+ __u32 request_seq;
+
+ request_seq = ntohl(outreq->request_seq);
+
+ if (ntohl(outreq->send_reset_at_tsn) >
+ sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map)) {
+ result = SCTP_STRRESET_IN_PROGRESS;
+ goto out;
+ }
+
+ if (request_seq > asoc->strreset_inseq) {
+ result = SCTP_STRRESET_ERR_BAD_SEQNO;
+ goto out;
+ } else if (request_seq == asoc->strreset_inseq) {
+ asoc->strreset_inseq++;
+ }
+
+ /* Check strreset_enable after inseq inc, as sender cannot tell
+ * the peer doesn't enable strreset after receiving response with
+ * result denied, as well as to keep consistent with bsd.
+ */
+ if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ))
+ goto out;
+
+ if (asoc->strreset_chunk) {
+ sctp_paramhdr_t *param_hdr;
+ struct sctp_transport *t;
+
+ param_hdr = sctp_chunk_lookup_strreset_param(
+ asoc, outreq->response_seq);
+ if (!param_hdr || param_hdr->type !=
+ SCTP_PARAM_RESET_IN_REQUEST) {
+ /* same process with outstanding isn't 0 */
+ result = SCTP_STRRESET_ERR_IN_PROGRESS;
+ goto out;
+ }
+
+ asoc->strreset_outstanding--;
+ asoc->strreset_outseq++;
+
+ if (!asoc->strreset_outstanding) {
+ t = asoc->strreset_chunk->transport;
+ if (del_timer(&t->reconf_timer))
+ sctp_transport_put(t);
+
+ sctp_chunk_put(asoc->strreset_chunk);
+ asoc->strreset_chunk = NULL;
+ }
+
+ flags = SCTP_STREAM_RESET_INCOMING_SSN;
+ }
+
+ nums = (ntohs(param.p->length) - sizeof(*outreq)) / 2;
+ if (nums) {
+ str_p = outreq->list_of_streams;
+ for (i = 0; i < nums; i++) {
+ if (ntohs(str_p[i]) >= stream->incnt) {
+ result = SCTP_STRRESET_ERR_WRONG_SSN;
+ goto out;
+ }
+ }
+
+ for (i = 0; i < nums; i++)
+ stream->in[ntohs(str_p[i])].ssn = 0;
+ } else {
+ for (i = 0; i < stream->incnt; i++)
+ stream->in[i].ssn = 0;
+ }
+
+ result = SCTP_STRRESET_PERFORMED;
+
+ *evp = sctp_ulpevent_make_stream_reset_event(asoc,
+ flags | SCTP_STREAM_RESET_OUTGOING_SSN, nums, str_p,
+ GFP_ATOMIC);
+
+out:
+ return sctp_make_strreset_resp(asoc, result, request_seq);
+}
+
+struct sctp_chunk *sctp_process_strreset_inreq(
+ struct sctp_association *asoc,
+ union sctp_params param,
+ struct sctp_ulpevent **evp)
+{
+ struct sctp_strreset_inreq *inreq = param.v;
+ struct sctp_stream *stream = asoc->stream;
+ __u32 result = SCTP_STRRESET_DENIED;
+ struct sctp_chunk *chunk = NULL;
+ __u16 i, nums, *str_p;
+ __u32 request_seq;
+
+ request_seq = ntohl(inreq->request_seq);
+ if (request_seq > asoc->strreset_inseq) {
+ result = SCTP_STRRESET_ERR_BAD_SEQNO;
+ goto out;
+ } else if (request_seq == asoc->strreset_inseq) {
+ asoc->strreset_inseq++;
+ }
+
+ if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ))
+ goto out;
+
+ if (asoc->strreset_outstanding) {
+ result = SCTP_STRRESET_ERR_IN_PROGRESS;
+ goto out;
+ }
+
+ nums = (ntohs(param.p->length) - sizeof(*inreq)) / 2;
+ str_p = inreq->list_of_streams;
+ for (i = 0; i < nums; i++) {
+ if (ntohs(str_p[i]) >= stream->outcnt) {
+ result = SCTP_STRRESET_ERR_WRONG_SSN;
+ goto out;
+ }
+ }
+
+ chunk = sctp_make_strreset_req(asoc, nums, str_p, 1, 0);
+ if (!chunk)
+ goto out;
+
+ if (nums)
+ for (i = 0; i < nums; i++)
+ stream->out[ntohs(str_p[i])].state =
+ SCTP_STREAM_CLOSED;
+ else
+ for (i = 0; i < stream->outcnt; i++)
+ stream->out[i].state = SCTP_STREAM_CLOSED;
+
+ asoc->strreset_chunk = chunk;
+ asoc->strreset_outstanding = 1;
+ sctp_chunk_hold(asoc->strreset_chunk);
+
+ *evp = sctp_ulpevent_make_stream_reset_event(asoc,
+ SCTP_STREAM_RESET_INCOMING_SSN, nums, str_p, GFP_ATOMIC);
+
+out:
+ if (!chunk)
+ chunk = sctp_make_strreset_resp(asoc, result, request_seq);
+
+ return chunk;
+}
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index ce54dce13ddb..721eeebfcd8a 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -72,7 +72,7 @@ static struct sctp_transport *sctp_transport_init(struct net *net,
*/
peer->rto = msecs_to_jiffies(net->sctp.rto_initial);
- peer->last_time_heard = ktime_set(0, 0);
+ peer->last_time_heard = 0;
peer->last_time_ecne_reduced = jiffies;
peer->param_flags = SPP_HB_DISABLE |
@@ -88,9 +88,11 @@ static struct sctp_transport *sctp_transport_init(struct net *net,
INIT_LIST_HEAD(&peer->transports);
setup_timer(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event,
- (unsigned long)peer);
+ (unsigned long)peer);
setup_timer(&peer->hb_timer, sctp_generate_heartbeat_event,
- (unsigned long)peer);
+ (unsigned long)peer);
+ setup_timer(&peer->reconf_timer, sctp_generate_reconf_event,
+ (unsigned long)peer);
setup_timer(&peer->proto_unreach_timer,
sctp_generate_proto_unreach_event, (unsigned long)peer);
@@ -144,6 +146,9 @@ void sctp_transport_free(struct sctp_transport *transport)
if (del_timer(&transport->T3_rtx_timer))
sctp_transport_put(transport);
+ if (del_timer(&transport->reconf_timer))
+ sctp_transport_put(transport);
+
/* Delete the ICMP proto unreachable timer if it's active. */
if (del_timer(&transport->proto_unreach_timer))
sctp_association_put(transport->asoc);
@@ -211,6 +216,14 @@ void sctp_transport_reset_hb_timer(struct sctp_transport *transport)
sctp_transport_hold(transport);
}
+void sctp_transport_reset_reconf_timer(struct sctp_transport *transport)
+{
+ if (!timer_pending(&transport->reconf_timer))
+ if (!mod_timer(&transport->reconf_timer,
+ jiffies + transport->rto))
+ sctp_transport_hold(transport);
+}
+
/* This transport has been assigned to an association.
* Initialize fields from the association or from the sock itself.
* Register the reference count in the association.
@@ -227,7 +240,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
{
/* If we don't have a fresh route, look one up */
if (!transport->dst || transport->dst->obsolete) {
- dst_release(transport->dst);
+ sctp_transport_dst_release(transport);
transport->af_specific->get_dst(transport, &transport->saddr,
&transport->fl, sk);
}
@@ -238,14 +251,13 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
}
-void sctp_transport_update_pmtu(struct sock *sk, struct sctp_transport *t, u32 pmtu)
+void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
{
- struct dst_entry *dst;
+ struct dst_entry *dst = sctp_transport_dst_check(t);
if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n",
- __func__, pmtu,
- SCTP_DEFAULT_MINSEGMENT);
+ __func__, pmtu, SCTP_DEFAULT_MINSEGMENT);
/* Use default minimum segment size and disable
* pmtu discovery on this transport.
*/
@@ -254,17 +266,13 @@ void sctp_transport_update_pmtu(struct sock *sk, struct sctp_transport *t, u32 p
t->pathmtu = pmtu;
}
- dst = sctp_transport_dst_check(t);
- if (!dst)
- t->af_specific->get_dst(t, &t->saddr, &t->fl, sk);
-
if (dst) {
- dst->ops->update_pmtu(dst, sk, NULL, pmtu);
-
+ dst->ops->update_pmtu(dst, t->asoc->base.sk, NULL, pmtu);
dst = sctp_transport_dst_check(t);
- if (!dst)
- t->af_specific->get_dst(t, &t->saddr, &t->fl, sk);
}
+
+ if (!dst)
+ t->af_specific->get_dst(t, &t->saddr, &t->fl, t->asoc->base.sk);
}
/* Caches the dst entry and source address for a transport's destination
@@ -630,9 +638,7 @@ void sctp_transport_reset(struct sctp_transport *t)
t->srtt = 0;
t->rttvar = 0;
- /* Reset these additional varibles so that we have a clean
- * slate.
- */
+ /* Reset these additional variables so that we have a clean slate. */
t->partial_bytes_acked = 0;
t->flight_size = 0;
t->error_count = 0;
@@ -659,3 +665,17 @@ void sctp_transport_immediate_rtx(struct sctp_transport *t)
sctp_transport_hold(t);
}
}
+
+/* Drop dst */
+void sctp_transport_dst_release(struct sctp_transport *t)
+{
+ dst_release(t->dst);
+ t->dst = NULL;
+ t->dst_pending_confirm = 0;
+}
+
+/* Schedule neighbour confirm */
+void sctp_transport_dst_confirm(struct sctp_transport *t)
+{
+ t->dst_pending_confirm = 1;
+}
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index bea00058ce35..c8881bc542a0 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -854,6 +854,35 @@ struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event(
return event;
}
+struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event(
+ const struct sctp_association *asoc, __u16 flags, __u16 stream_num,
+ __u16 *stream_list, gfp_t gfp)
+{
+ struct sctp_stream_reset_event *sreset;
+ struct sctp_ulpevent *event;
+ struct sk_buff *skb;
+ int length, i;
+
+ length = sizeof(struct sctp_stream_reset_event) + 2 * stream_num;
+ event = sctp_ulpevent_new(length, MSG_NOTIFICATION, gfp);
+ if (!event)
+ return NULL;
+
+ skb = sctp_event2skb(event);
+ sreset = (struct sctp_stream_reset_event *)skb_put(skb, length);
+
+ sreset->strreset_type = SCTP_STREAM_RESET_EVENT;
+ sreset->strreset_flags = flags;
+ sreset->strreset_length = length;
+ sctp_ulpevent_set_owner(event, asoc);
+ sreset->strreset_assoc_id = sctp_assoc2id(asoc);
+
+ for (i = 0; i < stream_num; i++)
+ sreset->strreset_stream_list[i] = ntohs(stream_list[i]);
+
+ return event;
+}
+
/* Return the notification type, assuming this is a notification
* event.
*/
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 84d0fdaf7de9..aa3624d50278 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -760,11 +760,11 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
struct sk_buff_head *event_list;
struct sk_buff *pos, *tmp;
struct sctp_ulpevent *cevent;
- struct sctp_stream *in;
+ struct sctp_stream *stream;
__u16 sid, csid, cssn;
sid = event->stream;
- in = &ulpq->asoc->ssnmap->in;
+ stream = ulpq->asoc->stream;
event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev;
@@ -782,11 +782,11 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
if (csid < sid)
continue;
- if (cssn != sctp_ssn_peek(in, sid))
+ if (cssn != sctp_ssn_peek(stream, in, sid))
break;
- /* Found it, so mark in the ssnmap. */
- sctp_ssn_next(in, sid);
+ /* Found it, so mark in the stream. */
+ sctp_ssn_next(stream, in, sid);
__skb_unlink(pos, &ulpq->lobby);
@@ -849,7 +849,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
struct sctp_ulpevent *event)
{
__u16 sid, ssn;
- struct sctp_stream *in;
+ struct sctp_stream *stream;
/* Check if this message needs ordering. */
if (SCTP_DATA_UNORDERED & event->msg_flags)
@@ -858,10 +858,10 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
/* Note: The stream ID must be verified before this routine. */
sid = event->stream;
ssn = event->ssn;
- in = &ulpq->asoc->ssnmap->in;
+ stream = ulpq->asoc->stream;
/* Is this the expected SSN for this stream ID? */
- if (ssn != sctp_ssn_peek(in, sid)) {
+ if (ssn != sctp_ssn_peek(stream, in, sid)) {
/* We've received something out of order, so find where it
* needs to be placed. We order by stream and then by SSN.
*/
@@ -870,7 +870,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
}
/* Mark that the next chunk has been found. */
- sctp_ssn_next(in, sid);
+ sctp_ssn_next(stream, in, sid);
/* Go find any other chunks that were waiting for
* ordering.
@@ -888,12 +888,12 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
struct sk_buff *pos, *tmp;
struct sctp_ulpevent *cevent;
struct sctp_ulpevent *event;
- struct sctp_stream *in;
+ struct sctp_stream *stream;
struct sk_buff_head temp;
struct sk_buff_head *lobby = &ulpq->lobby;
__u16 csid, cssn;
- in = &ulpq->asoc->ssnmap->in;
+ stream = ulpq->asoc->stream;
/* We are holding the chunks by stream, by SSN. */
skb_queue_head_init(&temp);
@@ -912,7 +912,7 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
continue;
/* see if this ssn has been marked by skipping */
- if (!SSN_lt(cssn, sctp_ssn_peek(in, csid)))
+ if (!SSN_lt(cssn, sctp_ssn_peek(stream, in, csid)))
break;
__skb_unlink(pos, lobby);
@@ -932,8 +932,8 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
csid = cevent->stream;
cssn = cevent->ssn;
- if (csid == sid && cssn == sctp_ssn_peek(in, csid)) {
- sctp_ssn_next(in, csid);
+ if (csid == sid && cssn == sctp_ssn_peek(stream, in, csid)) {
+ sctp_ssn_next(stream, in, csid);
__skb_unlink(pos, lobby);
__skb_queue_tail(&temp, pos);
event = sctp_skb2event(pos);
@@ -955,17 +955,17 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
*/
void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn)
{
- struct sctp_stream *in;
+ struct sctp_stream *stream;
/* Note: The stream ID must be verified before this routine. */
- in = &ulpq->asoc->ssnmap->in;
+ stream = ulpq->asoc->stream;
/* Is this an old SSN? If so ignore. */
- if (SSN_lt(ssn, sctp_ssn_peek(in, sid)))
+ if (SSN_lt(ssn, sctp_ssn_peek(stream, in, sid)))
return;
/* Mark that we are no longer expecting this SSN or lower. */
- sctp_ssn_skip(in, sid, ssn);
+ sctp_ssn_skip(stream, in, sid, ssn);
/* Go find any other chunks that were waiting for
* ordering and deliver them if needed.
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
new file mode 100644
index 000000000000..c717ef0896aa
--- /dev/null
+++ b/net/smc/Kconfig
@@ -0,0 +1,20 @@
+config SMC
+ tristate "SMC socket protocol family"
+ depends on INET && INFINIBAND
+ ---help---
+ SMC-R provides a "sockets over RDMA" solution making use of
+ RDMA over Converged Ethernet (RoCE) technology to upgrade
+ AF_INET TCP connections transparently.
+ The Linux implementation of the SMC-R solution is designed as
+ a separate socket family SMC.
+
+ Select this option if you want to run SMC socket applications
+
+config SMC_DIAG
+ tristate "SMC: socket monitoring interface"
+ depends on SMC
+ ---help---
+ Support for SMC socket monitoring interface used by tools such as
+ smcss.
+
+ if unsure, say Y.
diff --git a/net/smc/Makefile b/net/smc/Makefile
new file mode 100644
index 000000000000..188104654b54
--- /dev/null
+++ b/net/smc/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_SMC) += smc.o
+obj-$(CONFIG_SMC_DIAG) += smc_diag.o
+smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
+smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
new file mode 100644
index 000000000000..093803786eac
--- /dev/null
+++ b/net/smc/af_smc.c
@@ -0,0 +1,1409 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * AF_SMC protocol family socket handler keeping the AF_INET sock address type
+ * applies to SOCK_STREAM sockets only
+ * offers an alternative communication option for TCP-protocol sockets
+ * applicable with RoCE-cards only
+ *
+ * Initial restrictions:
+ * - non-blocking connect postponed
+ * - IPv6 support postponed
+ * - support for alternate links postponed
+ * - partial support for non-blocking sockets only
+ * - support for urgent data postponed
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ * based on prototype from Frank Blaschka
+ */
+
+#define KMSG_COMPONENT "smc"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/inetdevice.h>
+#include <linux/workqueue.h>
+#include <linux/in.h>
+#include <linux/sched/signal.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/smc.h>
+
+#include "smc.h"
+#include "smc_clc.h"
+#include "smc_llc.h"
+#include "smc_cdc.h"
+#include "smc_core.h"
+#include "smc_ib.h"
+#include "smc_pnet.h"
+#include "smc_tx.h"
+#include "smc_rx.h"
+#include "smc_close.h"
+
+static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group
+ * creation
+ */
+
+struct smc_lgr_list smc_lgr_list = { /* established link groups */
+ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
+ .list = LIST_HEAD_INIT(smc_lgr_list.list),
+};
+
+static void smc_tcp_listen_work(struct work_struct *);
+
+static void smc_set_keepalive(struct sock *sk, int val)
+{
+ struct smc_sock *smc = smc_sk(sk);
+
+ smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
+}
+
+static struct smc_hashinfo smc_v4_hashinfo = {
+ .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
+};
+
+int smc_hash_sk(struct sock *sk)
+{
+ struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
+ struct hlist_head *head;
+
+ head = &h->ht;
+
+ write_lock_bh(&h->lock);
+ sk_add_node(sk, head);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ write_unlock_bh(&h->lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(smc_hash_sk);
+
+void smc_unhash_sk(struct sock *sk)
+{
+ struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
+
+ write_lock_bh(&h->lock);
+ if (sk_del_node_init(sk))
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ write_unlock_bh(&h->lock);
+}
+EXPORT_SYMBOL_GPL(smc_unhash_sk);
+
+struct proto smc_proto = {
+ .name = "SMC",
+ .owner = THIS_MODULE,
+ .keepalive = smc_set_keepalive,
+ .hash = smc_hash_sk,
+ .unhash = smc_unhash_sk,
+ .obj_size = sizeof(struct smc_sock),
+ .h.smc_hash = &smc_v4_hashinfo,
+ .slab_flags = SLAB_DESTROY_BY_RCU,
+};
+EXPORT_SYMBOL_GPL(smc_proto);
+
+static int smc_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = 0;
+
+ if (!sk)
+ goto out;
+
+ smc = smc_sk(sk);
+ sock_hold(sk);
+ if (sk->sk_state == SMC_LISTEN)
+ /* smc_close_non_accepted() is called and acquires
+ * sock lock for child sockets again
+ */
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+ else
+ lock_sock(sk);
+
+ if (smc->use_fallback) {
+ sk->sk_state = SMC_CLOSED;
+ sk->sk_state_change(sk);
+ } else {
+ rc = smc_close_active(smc);
+ sock_set_flag(sk, SOCK_DEAD);
+ sk->sk_shutdown |= SHUTDOWN_MASK;
+ }
+ if (smc->clcsock) {
+ sock_release(smc->clcsock);
+ smc->clcsock = NULL;
+ }
+
+ /* detach socket */
+ sock_orphan(sk);
+ sock->sk = NULL;
+ if (smc->use_fallback) {
+ schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
+ } else if (sk->sk_state == SMC_CLOSED) {
+ smc_conn_free(&smc->conn);
+ schedule_delayed_work(&smc->sock_put_work,
+ SMC_CLOSE_SOCK_PUT_DELAY);
+ }
+ sk->sk_prot->unhash(sk);
+ release_sock(sk);
+
+ sock_put(sk);
+out:
+ return rc;
+}
+
+static void smc_destruct(struct sock *sk)
+{
+ if (sk->sk_state != SMC_CLOSED)
+ return;
+ if (!sock_flag(sk, SOCK_DEAD))
+ return;
+
+ sk_refcnt_debug_dec(sk);
+}
+
+static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
+{
+ struct smc_sock *smc;
+ struct sock *sk;
+
+ sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
+ if (!sk)
+ return NULL;
+
+ sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
+ sk->sk_state = SMC_INIT;
+ sk->sk_destruct = smc_destruct;
+ sk->sk_protocol = SMCPROTO_SMC;
+ smc = smc_sk(sk);
+ INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
+ INIT_LIST_HEAD(&smc->accept_q);
+ spin_lock_init(&smc->accept_q_lock);
+ INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work);
+ sk->sk_prot->hash(sk);
+ sk_refcnt_debug_inc(sk);
+
+ return sk;
+}
+
+static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len)
+{
+ struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc;
+
+ smc = smc_sk(sk);
+
+ /* replicate tests from inet_bind(), to be safe wrt. future changes */
+ rc = -EINVAL;
+ if (addr_len < sizeof(struct sockaddr_in))
+ goto out;
+
+ rc = -EAFNOSUPPORT;
+ /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
+ if ((addr->sin_family != AF_INET) &&
+ ((addr->sin_family != AF_UNSPEC) ||
+ (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
+ goto out;
+
+ lock_sock(sk);
+
+ /* Check if socket is already active */
+ rc = -EINVAL;
+ if (sk->sk_state != SMC_INIT)
+ goto out_rel;
+
+ smc->clcsock->sk->sk_reuse = sk->sk_reuse;
+ rc = kernel_bind(smc->clcsock, uaddr, addr_len);
+
+out_rel:
+ release_sock(sk);
+out:
+ return rc;
+}
+
+static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
+ unsigned long mask)
+{
+ /* options we don't get control via setsockopt for */
+ nsk->sk_type = osk->sk_type;
+ nsk->sk_sndbuf = osk->sk_sndbuf;
+ nsk->sk_rcvbuf = osk->sk_rcvbuf;
+ nsk->sk_sndtimeo = osk->sk_sndtimeo;
+ nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
+ nsk->sk_mark = osk->sk_mark;
+ nsk->sk_priority = osk->sk_priority;
+ nsk->sk_rcvlowat = osk->sk_rcvlowat;
+ nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
+ nsk->sk_err = osk->sk_err;
+
+ nsk->sk_flags &= ~mask;
+ nsk->sk_flags |= osk->sk_flags & mask;
+}
+
+#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
+ (1UL << SOCK_KEEPOPEN) | \
+ (1UL << SOCK_LINGER) | \
+ (1UL << SOCK_BROADCAST) | \
+ (1UL << SOCK_TIMESTAMP) | \
+ (1UL << SOCK_DBG) | \
+ (1UL << SOCK_RCVTSTAMP) | \
+ (1UL << SOCK_RCVTSTAMPNS) | \
+ (1UL << SOCK_LOCALROUTE) | \
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
+ (1UL << SOCK_RXQ_OVFL) | \
+ (1UL << SOCK_WIFI_STATUS) | \
+ (1UL << SOCK_NOFCS) | \
+ (1UL << SOCK_FILTER_LOCKED))
+/* copy only relevant settings and flags of SOL_SOCKET level from smc to
+ * clc socket (since smc is not called for these options from net/core)
+ */
+static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
+{
+ smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
+}
+
+#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
+ (1UL << SOCK_KEEPOPEN) | \
+ (1UL << SOCK_LINGER) | \
+ (1UL << SOCK_DBG))
+/* copy only settings and flags relevant for smc from clc to smc socket */
+static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
+{
+ smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
+}
+
+/* determine subnet and mask of internal TCP socket */
+int smc_netinfo_by_tcpsk(struct socket *clcsock,
+ __be32 *subnet, u8 *prefix_len)
+{
+ struct dst_entry *dst = sk_dst_get(clcsock->sk);
+ struct sockaddr_in addr;
+ int rc = -ENOENT;
+ int len;
+
+ if (!dst) {
+ rc = -ENOTCONN;
+ goto out;
+ }
+ if (!dst->dev) {
+ rc = -ENODEV;
+ goto out_rel;
+ }
+
+ /* get address to which the internal TCP socket is bound */
+ kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
+ /* analyze IPv4 specific data of net_device belonging to TCP socket */
+ for_ifa(dst->dev->ip_ptr) {
+ if (ifa->ifa_address != addr.sin_addr.s_addr)
+ continue;
+ *prefix_len = inet_mask_len(ifa->ifa_mask);
+ *subnet = ifa->ifa_address & ifa->ifa_mask;
+ rc = 0;
+ break;
+ } endfor_ifa(dst->dev->ip_ptr);
+
+out_rel:
+ dst_release(dst);
+out:
+ return rc;
+}
+
+static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
+{
+ struct smc_link_group *lgr = smc->conn.lgr;
+ struct smc_link *link;
+ int rest;
+ int rc;
+
+ link = &lgr->lnk[SMC_SINGLE_LINK];
+ /* receive CONFIRM LINK request from server over RoCE fabric */
+ rest = wait_for_completion_interruptible_timeout(
+ &link->llc_confirm,
+ SMC_LLC_WAIT_FIRST_TIME);
+ if (rest <= 0) {
+ struct smc_clc_msg_decline dclc;
+
+ rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+ SMC_CLC_DECLINE);
+ return rc;
+ }
+
+ rc = smc_ib_modify_qp_rts(link);
+ if (rc)
+ return SMC_CLC_DECL_INTERR;
+
+ smc_wr_remember_qp_attr(link);
+ /* send CONFIRM LINK response over RoCE fabric */
+ rc = smc_llc_send_confirm_link(link,
+ link->smcibdev->mac[link->ibport - 1],
+ gid, SMC_LLC_RESP);
+ if (rc < 0)
+ return SMC_CLC_DECL_TCL;
+
+ return rc;
+}
+
+static void smc_conn_save_peer_info(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *clc)
+{
+ smc->conn.peer_conn_idx = clc->conn_idx;
+ smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
+ smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
+ atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
+}
+
+static void smc_link_save_peer_info(struct smc_link *link,
+ struct smc_clc_msg_accept_confirm *clc)
+{
+ link->peer_qpn = ntoh24(clc->qpn);
+ memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
+ memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
+ link->peer_psn = ntoh24(clc->psn);
+ link->peer_mtu = clc->qp_mtu;
+}
+
+/* setup for RDMA connection of client */
+static int smc_connect_rdma(struct smc_sock *smc)
+{
+ struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
+ struct smc_clc_msg_accept_confirm aclc;
+ int local_contact = SMC_FIRST_CONTACT;
+ struct smc_ib_device *smcibdev;
+ struct smc_link *link;
+ u8 srv_first_contact;
+ int reason_code = 0;
+ int rc = 0;
+ u8 ibport;
+
+ /* IPSec connections opt out of SMC-R optimizations */
+ if (using_ipsec(smc)) {
+ reason_code = SMC_CLC_DECL_IPSEC;
+ goto decline_rdma;
+ }
+
+ /* PNET table look up: search active ib_device and port
+ * within same PNETID that also contains the ethernet device
+ * used for the internal TCP socket
+ */
+ smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
+ if (!smcibdev) {
+ reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+ goto decline_rdma;
+ }
+
+ /* do inband token exchange */
+ reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
+ if (reason_code < 0) {
+ rc = reason_code;
+ goto out_err;
+ }
+ if (reason_code > 0) /* configuration error */
+ goto decline_rdma;
+ /* receive SMC Accept CLC message */
+ reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
+ SMC_CLC_ACCEPT);
+ if (reason_code < 0) {
+ rc = reason_code;
+ goto out_err;
+ }
+ if (reason_code > 0)
+ goto decline_rdma;
+
+ srv_first_contact = aclc.hdr.flag;
+ mutex_lock(&smc_create_lgr_pending);
+ local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
+ ibport, &aclc.lcl, srv_first_contact);
+ if (local_contact < 0) {
+ rc = local_contact;
+ if (rc == -ENOMEM)
+ reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
+ else if (rc == -ENOLINK)
+ reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
+ goto decline_rdma_unlock;
+ }
+ link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+
+ smc_conn_save_peer_info(smc, &aclc);
+
+ rc = smc_sndbuf_create(smc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_MEM;
+ goto decline_rdma_unlock;
+ }
+ rc = smc_rmb_create(smc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_MEM;
+ goto decline_rdma_unlock;
+ }
+
+ if (local_contact == SMC_FIRST_CONTACT)
+ smc_link_save_peer_info(link, &aclc);
+
+ rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_INTERR;
+ goto decline_rdma_unlock;
+ }
+
+ if (local_contact == SMC_FIRST_CONTACT) {
+ rc = smc_ib_ready_link(link);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_INTERR;
+ goto decline_rdma_unlock;
+ }
+ }
+
+ rc = smc_clc_send_confirm(smc);
+ if (rc)
+ goto out_err_unlock;
+
+ if (local_contact == SMC_FIRST_CONTACT) {
+ /* QP confirmation over RoCE fabric */
+ reason_code = smc_clnt_conf_first_link(
+ smc, &smcibdev->gid[ibport - 1]);
+ if (reason_code < 0) {
+ rc = reason_code;
+ goto out_err_unlock;
+ }
+ if (reason_code > 0)
+ goto decline_rdma_unlock;
+ }
+
+ mutex_unlock(&smc_create_lgr_pending);
+ smc_tx_init(smc);
+ smc_rx_init(smc);
+
+out_connected:
+ smc_copy_sock_settings_to_clc(smc);
+ if (smc->sk.sk_state == SMC_INIT)
+ smc->sk.sk_state = SMC_ACTIVE;
+
+ return rc ? rc : local_contact;
+
+decline_rdma_unlock:
+ mutex_unlock(&smc_create_lgr_pending);
+ smc_conn_free(&smc->conn);
+decline_rdma:
+ /* RDMA setup failed, switch back to TCP */
+ smc->use_fallback = true;
+ if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
+ rc = smc_clc_send_decline(smc, reason_code, 0);
+ if (rc < sizeof(struct smc_clc_msg_decline))
+ goto out_err;
+ }
+ goto out_connected;
+
+out_err_unlock:
+ mutex_unlock(&smc_create_lgr_pending);
+ smc_conn_free(&smc->conn);
+out_err:
+ return rc;
+}
+
+static int smc_connect(struct socket *sock, struct sockaddr *addr,
+ int alen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -EINVAL;
+
+ smc = smc_sk(sk);
+
+ /* separate smc parameter checking to be safe */
+ if (alen < sizeof(addr->sa_family))
+ goto out_err;
+ if (addr->sa_family != AF_INET)
+ goto out_err;
+ smc->addr = addr; /* needed for nonblocking connect */
+
+ lock_sock(sk);
+ switch (sk->sk_state) {
+ default:
+ goto out;
+ case SMC_ACTIVE:
+ rc = -EISCONN;
+ goto out;
+ case SMC_INIT:
+ rc = 0;
+ break;
+ }
+
+ smc_copy_sock_settings_to_clc(smc);
+ rc = kernel_connect(smc->clcsock, addr, alen, flags);
+ if (rc)
+ goto out;
+
+ /* setup RDMA connection */
+ rc = smc_connect_rdma(smc);
+ if (rc < 0)
+ goto out;
+ else
+ rc = 0; /* success cases including fallback */
+
+out:
+ release_sock(sk);
+out_err:
+ return rc;
+}
+
+static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
+{
+ struct sock *sk = &lsmc->sk;
+ struct socket *new_clcsock;
+ struct sock *new_sk;
+ int rc;
+
+ release_sock(&lsmc->sk);
+ new_sk = smc_sock_alloc(sock_net(sk), NULL);
+ if (!new_sk) {
+ rc = -ENOMEM;
+ lsmc->sk.sk_err = ENOMEM;
+ *new_smc = NULL;
+ lock_sock(&lsmc->sk);
+ goto out;
+ }
+ *new_smc = smc_sk(new_sk);
+
+ rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
+ lock_sock(&lsmc->sk);
+ if (rc < 0) {
+ lsmc->sk.sk_err = -rc;
+ new_sk->sk_state = SMC_CLOSED;
+ sock_set_flag(new_sk, SOCK_DEAD);
+ sk->sk_prot->unhash(new_sk);
+ sock_put(new_sk);
+ *new_smc = NULL;
+ goto out;
+ }
+ if (lsmc->sk.sk_state == SMC_CLOSED) {
+ if (new_clcsock)
+ sock_release(new_clcsock);
+ new_sk->sk_state = SMC_CLOSED;
+ sock_set_flag(new_sk, SOCK_DEAD);
+ sk->sk_prot->unhash(new_sk);
+ sock_put(new_sk);
+ *new_smc = NULL;
+ goto out;
+ }
+
+ (*new_smc)->clcsock = new_clcsock;
+out:
+ return rc;
+}
+
+/* add a just created sock to the accept queue of the listen sock as
+ * candidate for a following socket accept call from user space
+ */
+static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
+{
+ struct smc_sock *par = smc_sk(parent);
+
+ sock_hold(sk);
+ spin_lock(&par->accept_q_lock);
+ list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
+ spin_unlock(&par->accept_q_lock);
+ sk_acceptq_added(parent);
+}
+
+/* remove a socket from the accept queue of its parental listening socket */
+static void smc_accept_unlink(struct sock *sk)
+{
+ struct smc_sock *par = smc_sk(sk)->listen_smc;
+
+ spin_lock(&par->accept_q_lock);
+ list_del_init(&smc_sk(sk)->accept_q);
+ spin_unlock(&par->accept_q_lock);
+ sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
+ sock_put(sk);
+}
+
+/* remove a sock from the accept queue to bind it to a new socket created
+ * for a socket accept call from user space
+ */
+struct sock *smc_accept_dequeue(struct sock *parent,
+ struct socket *new_sock)
+{
+ struct smc_sock *isk, *n;
+ struct sock *new_sk;
+
+ list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
+ new_sk = (struct sock *)isk;
+
+ smc_accept_unlink(new_sk);
+ if (new_sk->sk_state == SMC_CLOSED) {
+ /* tbd in follow-on patch: close this sock */
+ continue;
+ }
+ if (new_sock)
+ sock_graft(new_sk, new_sock);
+ return new_sk;
+ }
+ return NULL;
+}
+
+/* clean up for a created but never accepted sock */
+void smc_close_non_accepted(struct sock *sk)
+{
+ struct smc_sock *smc = smc_sk(sk);
+
+ sock_hold(sk);
+ lock_sock(sk);
+ if (!sk->sk_lingertime)
+ /* wait for peer closing */
+ sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
+ if (!smc->use_fallback)
+ smc_close_active(smc);
+ if (smc->clcsock) {
+ struct socket *tcp;
+
+ tcp = smc->clcsock;
+ smc->clcsock = NULL;
+ sock_release(tcp);
+ }
+ sock_set_flag(sk, SOCK_DEAD);
+ sk->sk_shutdown |= SHUTDOWN_MASK;
+ if (smc->use_fallback) {
+ schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
+ } else {
+ smc_conn_free(&smc->conn);
+ schedule_delayed_work(&smc->sock_put_work,
+ SMC_CLOSE_SOCK_PUT_DELAY);
+ }
+ release_sock(sk);
+ sock_put(sk);
+}
+
+static int smc_serv_conf_first_link(struct smc_sock *smc)
+{
+ struct smc_link_group *lgr = smc->conn.lgr;
+ struct smc_link *link;
+ int rest;
+ int rc;
+
+ link = &lgr->lnk[SMC_SINGLE_LINK];
+ /* send CONFIRM LINK request to client over the RoCE fabric */
+ rc = smc_llc_send_confirm_link(link,
+ link->smcibdev->mac[link->ibport - 1],
+ &link->smcibdev->gid[link->ibport - 1],
+ SMC_LLC_REQ);
+ if (rc < 0)
+ return SMC_CLC_DECL_TCL;
+
+ /* receive CONFIRM LINK response from client over the RoCE fabric */
+ rest = wait_for_completion_interruptible_timeout(
+ &link->llc_confirm_resp,
+ SMC_LLC_WAIT_FIRST_TIME);
+ if (rest <= 0) {
+ struct smc_clc_msg_decline dclc;
+
+ rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+ SMC_CLC_DECLINE);
+ }
+
+ return rc;
+}
+
+/* setup for RDMA connection of server */
+static void smc_listen_work(struct work_struct *work)
+{
+ struct smc_sock *new_smc = container_of(work, struct smc_sock,
+ smc_listen_work);
+ struct socket *newclcsock = new_smc->clcsock;
+ struct smc_sock *lsmc = new_smc->listen_smc;
+ struct smc_clc_msg_accept_confirm cclc;
+ int local_contact = SMC_REUSE_CONTACT;
+ struct sock *newsmcsk = &new_smc->sk;
+ struct smc_clc_msg_proposal pclc;
+ struct smc_ib_device *smcibdev;
+ struct sockaddr_in peeraddr;
+ struct smc_link *link;
+ int reason_code = 0;
+ int rc = 0, len;
+ __be32 subnet;
+ u8 prefix_len;
+ u8 ibport;
+
+ /* do inband token exchange -
+ *wait for and receive SMC Proposal CLC message
+ */
+ reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
+ SMC_CLC_PROPOSAL);
+ if (reason_code < 0)
+ goto out_err;
+ if (reason_code > 0)
+ goto decline_rdma;
+
+ /* IPSec connections opt out of SMC-R optimizations */
+ if (using_ipsec(new_smc)) {
+ reason_code = SMC_CLC_DECL_IPSEC;
+ goto decline_rdma;
+ }
+
+ /* PNET table look up: search active ib_device and port
+ * within same PNETID that also contains the ethernet device
+ * used for the internal TCP socket
+ */
+ smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
+ if (!smcibdev) {
+ reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+ goto decline_rdma;
+ }
+
+ /* determine subnet and mask from internal TCP socket */
+ rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+ goto decline_rdma;
+ }
+ if ((pclc.outgoing_subnet != subnet) ||
+ (pclc.prefix_len != prefix_len)) {
+ reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+ goto decline_rdma;
+ }
+
+ /* get address of the peer connected to the internal TCP socket */
+ kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
+
+ /* allocate connection / link group */
+ mutex_lock(&smc_create_lgr_pending);
+ local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
+ smcibdev, ibport, &pclc.lcl, 0);
+ if (local_contact == SMC_REUSE_CONTACT)
+ /* lock no longer needed, free it due to following
+ * smc_clc_wait_msg() call
+ */
+ mutex_unlock(&smc_create_lgr_pending);
+ if (local_contact < 0) {
+ rc = local_contact;
+ if (rc == -ENOMEM)
+ reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
+ else if (rc == -ENOLINK)
+ reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
+ goto decline_rdma;
+ }
+ link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+
+ rc = smc_sndbuf_create(new_smc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_MEM;
+ goto decline_rdma;
+ }
+ rc = smc_rmb_create(new_smc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_MEM;
+ goto decline_rdma;
+ }
+
+ rc = smc_clc_send_accept(new_smc, local_contact);
+ if (rc)
+ goto out_err;
+
+ /* receive SMC Confirm CLC message */
+ reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
+ SMC_CLC_CONFIRM);
+ if (reason_code < 0)
+ goto out_err;
+ if (reason_code > 0)
+ goto decline_rdma;
+ smc_conn_save_peer_info(new_smc, &cclc);
+ if (local_contact == SMC_FIRST_CONTACT)
+ smc_link_save_peer_info(link, &cclc);
+
+ rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_INTERR;
+ goto decline_rdma;
+ }
+
+ if (local_contact == SMC_FIRST_CONTACT) {
+ rc = smc_ib_ready_link(link);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_INTERR;
+ goto decline_rdma;
+ }
+ /* QP confirmation over RoCE fabric */
+ reason_code = smc_serv_conf_first_link(new_smc);
+ if (reason_code < 0) {
+ /* peer is not aware of a problem */
+ rc = reason_code;
+ goto out_err;
+ }
+ if (reason_code > 0)
+ goto decline_rdma;
+ }
+
+ smc_tx_init(new_smc);
+ smc_rx_init(new_smc);
+
+out_connected:
+ sk_refcnt_debug_inc(newsmcsk);
+ if (newsmcsk->sk_state == SMC_INIT)
+ newsmcsk->sk_state = SMC_ACTIVE;
+enqueue:
+ if (local_contact == SMC_FIRST_CONTACT)
+ mutex_unlock(&smc_create_lgr_pending);
+ lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
+ if (lsmc->sk.sk_state == SMC_LISTEN) {
+ smc_accept_enqueue(&lsmc->sk, newsmcsk);
+ } else { /* no longer listening */
+ smc_close_non_accepted(newsmcsk);
+ }
+ release_sock(&lsmc->sk);
+
+ /* Wake up accept */
+ lsmc->sk.sk_data_ready(&lsmc->sk);
+ sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
+ return;
+
+decline_rdma:
+ /* RDMA setup failed, switch back to TCP */
+ smc_conn_free(&new_smc->conn);
+ new_smc->use_fallback = true;
+ if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
+ rc = smc_clc_send_decline(new_smc, reason_code, 0);
+ if (rc < sizeof(struct smc_clc_msg_decline))
+ goto out_err;
+ }
+ goto out_connected;
+
+out_err:
+ newsmcsk->sk_state = SMC_CLOSED;
+ smc_conn_free(&new_smc->conn);
+ goto enqueue; /* queue new sock with sk_err set */
+}
+
+static void smc_tcp_listen_work(struct work_struct *work)
+{
+ struct smc_sock *lsmc = container_of(work, struct smc_sock,
+ tcp_listen_work);
+ struct smc_sock *new_smc;
+ int rc = 0;
+
+ lock_sock(&lsmc->sk);
+ while (lsmc->sk.sk_state == SMC_LISTEN) {
+ rc = smc_clcsock_accept(lsmc, &new_smc);
+ if (rc)
+ goto out;
+ if (!new_smc)
+ continue;
+
+ new_smc->listen_smc = lsmc;
+ new_smc->use_fallback = false; /* assume rdma capability first*/
+ sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
+ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
+ smc_copy_sock_settings_to_smc(new_smc);
+ schedule_work(&new_smc->smc_listen_work);
+ }
+
+out:
+ release_sock(&lsmc->sk);
+ lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
+}
+
+static int smc_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+
+ rc = -EINVAL;
+ if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
+ goto out;
+
+ rc = 0;
+ if (sk->sk_state == SMC_LISTEN) {
+ sk->sk_max_ack_backlog = backlog;
+ goto out;
+ }
+ /* some socket options are handled in core, so we could not apply
+ * them to the clc socket -- copy smc socket options to clc socket
+ */
+ smc_copy_sock_settings_to_clc(smc);
+
+ rc = kernel_listen(smc->clcsock, backlog);
+ if (rc)
+ goto out;
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_ack_backlog = 0;
+ sk->sk_state = SMC_LISTEN;
+ INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
+ schedule_work(&smc->tcp_listen_work);
+
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static int smc_accept(struct socket *sock, struct socket *new_sock,
+ int flags, bool kern)
+{
+ struct sock *sk = sock->sk, *nsk;
+ DECLARE_WAITQUEUE(wait, current);
+ struct smc_sock *lsmc;
+ long timeo;
+ int rc = 0;
+
+ lsmc = smc_sk(sk);
+ lock_sock(sk);
+
+ if (lsmc->sk.sk_state != SMC_LISTEN) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* Wait for an incoming connection */
+ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+ add_wait_queue_exclusive(sk_sleep(sk), &wait);
+ while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!timeo) {
+ rc = -EAGAIN;
+ break;
+ }
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ /* wakeup by sk_data_ready in smc_listen_work() */
+ sched_annotate_sleep();
+ lock_sock(sk);
+ if (signal_pending(current)) {
+ rc = sock_intr_errno(timeo);
+ break;
+ }
+ }
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ if (!rc)
+ rc = sock_error(nsk);
+
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static int smc_getname(struct socket *sock, struct sockaddr *addr,
+ int *len, int peer)
+{
+ struct smc_sock *smc;
+
+ if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
+ (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
+ return -ENOTCONN;
+
+ smc = smc_sk(sock->sk);
+
+ return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
+}
+
+static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -EPIPE;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+ if ((sk->sk_state != SMC_ACTIVE) &&
+ (sk->sk_state != SMC_APPCLOSEWAIT1) &&
+ (sk->sk_state != SMC_INIT))
+ goto out;
+ if (smc->use_fallback)
+ rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
+ else
+ rc = smc_tx_sendmsg(smc, msg, len);
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -ENOTCONN;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+ if ((sk->sk_state == SMC_INIT) ||
+ (sk->sk_state == SMC_LISTEN) ||
+ (sk->sk_state == SMC_CLOSED))
+ goto out;
+
+ if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
+ rc = 0;
+ goto out;
+ }
+
+ if (smc->use_fallback)
+ rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
+ else
+ rc = smc_rx_recvmsg(smc, msg, len, flags);
+
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static unsigned int smc_accept_poll(struct sock *parent)
+{
+ struct smc_sock *isk;
+ struct sock *sk;
+
+ lock_sock(parent);
+ list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
+ sk = (struct sock *)isk;
+
+ if (sk->sk_state == SMC_ACTIVE) {
+ release_sock(parent);
+ return POLLIN | POLLRDNORM;
+ }
+ }
+ release_sock(parent);
+
+ return 0;
+}
+
+static unsigned int smc_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ unsigned int mask = 0;
+ struct smc_sock *smc;
+ int rc;
+
+ smc = smc_sk(sock->sk);
+ if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
+ /* delegate to CLC child sock */
+ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
+ /* if non-blocking connect finished ... */
+ lock_sock(sk);
+ if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
+ sk->sk_err = smc->clcsock->sk->sk_err;
+ if (sk->sk_err) {
+ mask |= POLLERR;
+ } else {
+ rc = smc_connect_rdma(smc);
+ if (rc < 0)
+ mask |= POLLERR;
+ else
+ /* success cases including fallback */
+ mask |= POLLOUT | POLLWRNORM;
+ }
+ }
+ release_sock(sk);
+ } else {
+ sock_poll_wait(file, sk_sleep(sk), wait);
+ if (sk->sk_state == SMC_LISTEN)
+ /* woken up by sk_data_ready in smc_listen_work() */
+ mask |= smc_accept_poll(sk);
+ if (sk->sk_err)
+ mask |= POLLERR;
+ if (atomic_read(&smc->conn.sndbuf_space) ||
+ (sk->sk_shutdown & SEND_SHUTDOWN)) {
+ mask |= POLLOUT | POLLWRNORM;
+ } else {
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ }
+ if (atomic_read(&smc->conn.bytes_to_rcv))
+ mask |= POLLIN | POLLRDNORM;
+ if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
+ (sk->sk_state == SMC_CLOSED))
+ mask |= POLLHUP;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+ if (sk->sk_state == SMC_APPCLOSEWAIT1)
+ mask |= POLLIN;
+
+ }
+
+ return mask;
+}
+
+static int smc_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -EINVAL;
+ int rc1 = 0;
+
+ smc = smc_sk(sk);
+
+ if ((how < SHUT_RD) || (how > SHUT_RDWR))
+ return rc;
+
+ lock_sock(sk);
+
+ rc = -ENOTCONN;
+ if ((sk->sk_state != SMC_LISTEN) &&
+ (sk->sk_state != SMC_ACTIVE) &&
+ (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
+ (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
+ (sk->sk_state != SMC_APPCLOSEWAIT1) &&
+ (sk->sk_state != SMC_APPCLOSEWAIT2) &&
+ (sk->sk_state != SMC_APPFINCLOSEWAIT))
+ goto out;
+ if (smc->use_fallback) {
+ rc = kernel_sock_shutdown(smc->clcsock, how);
+ sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ sk->sk_state = SMC_CLOSED;
+ goto out;
+ }
+ switch (how) {
+ case SHUT_RDWR: /* shutdown in both directions */
+ rc = smc_close_active(smc);
+ break;
+ case SHUT_WR:
+ rc = smc_close_shutdown_write(smc);
+ break;
+ case SHUT_RD:
+ if (sk->sk_state == SMC_LISTEN)
+ rc = smc_close_active(smc);
+ else
+ rc = 0;
+ /* nothing more to do because peer is not involved */
+ break;
+ }
+ rc1 = kernel_sock_shutdown(smc->clcsock, how);
+ /* map sock_shutdown_cmd constants to sk_shutdown value range */
+ sk->sk_shutdown |= how + 1;
+
+out:
+ release_sock(sk);
+ return rc ? rc : rc1;
+}
+
+static int smc_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+
+ smc = smc_sk(sk);
+
+ /* generic setsockopts reaching us here always apply to the
+ * CLC socket
+ */
+ return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
+ optval, optlen);
+}
+
+static int smc_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct smc_sock *smc;
+
+ smc = smc_sk(sock->sk);
+ /* socket options apply to the CLC socket */
+ return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
+ optval, optlen);
+}
+
+static int smc_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ struct smc_sock *smc;
+
+ smc = smc_sk(sock->sk);
+ if (smc->use_fallback)
+ return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
+ else
+ return sock_no_ioctl(sock, cmd, arg);
+}
+
+static ssize_t smc_sendpage(struct socket *sock, struct page *page,
+ int offset, size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -EPIPE;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+ if (sk->sk_state != SMC_ACTIVE)
+ goto out;
+ if (smc->use_fallback)
+ rc = kernel_sendpage(smc->clcsock, page, offset,
+ size, flags);
+ else
+ rc = sock_no_sendpage(sock, page, offset, size, flags);
+
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -ENOTCONN;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+ if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
+ goto out;
+ if (smc->use_fallback) {
+ rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
+ pipe, len, flags);
+ } else {
+ rc = -EOPNOTSUPP;
+ }
+out:
+ release_sock(sk);
+ return rc;
+}
+
+/* must look like tcp */
+static const struct proto_ops smc_sock_ops = {
+ .family = PF_SMC,
+ .owner = THIS_MODULE,
+ .release = smc_release,
+ .bind = smc_bind,
+ .connect = smc_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = smc_accept,
+ .getname = smc_getname,
+ .poll = smc_poll,
+ .ioctl = smc_ioctl,
+ .listen = smc_listen,
+ .shutdown = smc_shutdown,
+ .setsockopt = smc_setsockopt,
+ .getsockopt = smc_getsockopt,
+ .sendmsg = smc_sendmsg,
+ .recvmsg = smc_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = smc_sendpage,
+ .splice_read = smc_splice_read,
+};
+
+static int smc_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct smc_sock *smc;
+ struct sock *sk;
+ int rc;
+
+ rc = -ESOCKTNOSUPPORT;
+ if (sock->type != SOCK_STREAM)
+ goto out;
+
+ rc = -EPROTONOSUPPORT;
+ if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
+ goto out;
+
+ rc = -ENOBUFS;
+ sock->ops = &smc_sock_ops;
+ sk = smc_sock_alloc(net, sock);
+ if (!sk)
+ goto out;
+
+ /* create internal TCP socket for CLC handshake and fallback */
+ smc = smc_sk(sk);
+ smc->use_fallback = false; /* assume rdma capability first */
+ rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
+ IPPROTO_TCP, &smc->clcsock);
+ if (rc)
+ sk_common_release(sk);
+ smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
+ smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
+
+out:
+ return rc;
+}
+
+static const struct net_proto_family smc_sock_family_ops = {
+ .family = PF_SMC,
+ .owner = THIS_MODULE,
+ .create = smc_create,
+};
+
+static int __init smc_init(void)
+{
+ int rc;
+
+ rc = smc_pnet_init();
+ if (rc)
+ return rc;
+
+ rc = smc_llc_init();
+ if (rc) {
+ pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
+ goto out_pnet;
+ }
+
+ rc = smc_cdc_init();
+ if (rc) {
+ pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
+ goto out_pnet;
+ }
+
+ rc = proto_register(&smc_proto, 1);
+ if (rc) {
+ pr_err("%s: proto_register fails with %d\n", __func__, rc);
+ goto out_pnet;
+ }
+
+ rc = sock_register(&smc_sock_family_ops);
+ if (rc) {
+ pr_err("%s: sock_register fails with %d\n", __func__, rc);
+ goto out_proto;
+ }
+ INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
+
+ rc = smc_ib_register_client();
+ if (rc) {
+ pr_err("%s: ib_register fails with %d\n", __func__, rc);
+ goto out_sock;
+ }
+
+ return 0;
+
+out_sock:
+ sock_unregister(PF_SMC);
+out_proto:
+ proto_unregister(&smc_proto);
+out_pnet:
+ smc_pnet_exit();
+ return rc;
+}
+
+static void __exit smc_exit(void)
+{
+ struct smc_link_group *lgr, *lg;
+ LIST_HEAD(lgr_freeing_list);
+
+ spin_lock_bh(&smc_lgr_list.lock);
+ if (!list_empty(&smc_lgr_list.list))
+ list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
+ spin_unlock_bh(&smc_lgr_list.lock);
+ list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
+ list_del_init(&lgr->list);
+ smc_lgr_free(lgr); /* free link group */
+ }
+ smc_ib_unregister_client();
+ sock_unregister(PF_SMC);
+ proto_unregister(&smc_proto);
+ smc_pnet_exit();
+}
+
+module_init(smc_init);
+module_exit(smc_exit);
+
+MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("smc socket address family");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_SMC);
diff --git a/net/smc/smc.h b/net/smc/smc.h
new file mode 100644
index 000000000000..ee5fbea24549
--- /dev/null
+++ b/net/smc/smc.h
@@ -0,0 +1,274 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for the SMC module (socket related)
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+#ifndef __SMC_H
+#define __SMC_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+#include <linux/compiler.h> /* __aligned */
+#include <net/sock.h>
+
+#include "smc_ib.h"
+
+#define SMCPROTO_SMC 0 /* SMC protocol */
+
+#define SMC_MAX_PORTS 2 /* Max # of ports */
+
+extern struct proto smc_proto;
+
+#ifdef ATOMIC64_INIT
+#define KERNEL_HAS_ATOMIC64
+#endif
+
+enum smc_state { /* possible states of an SMC socket */
+ SMC_ACTIVE = 1,
+ SMC_INIT = 2,
+ SMC_CLOSED = 7,
+ SMC_LISTEN = 10,
+ /* normal close */
+ SMC_PEERCLOSEWAIT1 = 20,
+ SMC_PEERCLOSEWAIT2 = 21,
+ SMC_APPFINCLOSEWAIT = 24,
+ SMC_APPCLOSEWAIT1 = 22,
+ SMC_APPCLOSEWAIT2 = 23,
+ SMC_PEERFINCLOSEWAIT = 25,
+ /* abnormal close */
+ SMC_PEERABORTWAIT = 26,
+ SMC_PROCESSABORT = 27,
+};
+
+struct smc_link_group;
+
+struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */
+ u8 type;
+} __aligned(1);
+
+struct smc_cdc_conn_state_flags {
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 peer_done_writing : 1; /* Sending done indicator */
+ u8 peer_conn_closed : 1; /* Peer connection closed indicator */
+ u8 peer_conn_abort : 1; /* Abnormal close indicator */
+ u8 reserved : 5;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 5;
+ u8 peer_conn_abort : 1;
+ u8 peer_conn_closed : 1;
+ u8 peer_done_writing : 1;
+#endif
+};
+
+struct smc_cdc_producer_flags {
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 write_blocked : 1; /* Writing Blocked, no rx buf space */
+ u8 urg_data_pending : 1; /* Urgent Data Pending */
+ u8 urg_data_present : 1; /* Urgent Data Present */
+ u8 cons_curs_upd_req : 1; /* cursor update requested */
+ u8 failover_validation : 1;/* message replay due to failover */
+ u8 reserved : 3;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 3;
+ u8 failover_validation : 1;
+ u8 cons_curs_upd_req : 1;
+ u8 urg_data_present : 1;
+ u8 urg_data_pending : 1;
+ u8 write_blocked : 1;
+#endif
+};
+
+/* in host byte order */
+union smc_host_cursor { /* SMC cursor - an offset in an RMBE */
+ struct {
+ u16 reserved;
+ u16 wrap; /* window wrap sequence number */
+ u32 count; /* cursor (= offset) part */
+ };
+#ifdef KERNEL_HAS_ATOMIC64
+ atomic64_t acurs; /* for atomic processing */
+#else
+ u64 acurs; /* for atomic processing */
+#endif
+} __aligned(8);
+
+/* in host byte order, except for flag bitfields in network byte order */
+struct smc_host_cdc_msg { /* Connection Data Control message */
+ struct smc_wr_rx_hdr common; /* .type = 0xFE */
+ u8 len; /* length = 44 */
+ u16 seqno; /* connection seq # */
+ u32 token; /* alert_token */
+ union smc_host_cursor prod; /* producer cursor */
+ union smc_host_cursor cons; /* consumer cursor,
+ * piggy backed "ack"
+ */
+ struct smc_cdc_producer_flags prod_flags; /* conn. tx/rx status */
+ struct smc_cdc_conn_state_flags conn_state_flags; /* peer conn. status*/
+ u8 reserved[18];
+} __aligned(8);
+
+struct smc_connection {
+ struct rb_node alert_node;
+ struct smc_link_group *lgr; /* link group of connection */
+ u32 alert_token_local; /* unique conn. id */
+ u8 peer_conn_idx; /* from tcp handshake */
+ int peer_rmbe_size; /* size of peer rx buffer */
+ atomic_t peer_rmbe_space;/* remaining free bytes in peer
+ * rmbe
+ */
+ int rtoken_idx; /* idx to peer RMB rkey/addr */
+
+ struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */
+ int sndbuf_size; /* sndbuf size <== sock wmem */
+ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */
+ int rmbe_size; /* RMBE size <== sock rmem */
+ int rmbe_size_short;/* compressed notation */
+ int rmbe_update_limit;
+ /* lower limit for consumer
+ * cursor update
+ */
+
+ struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging
+ * buffer for CDC msg send
+ * .prod cf. TCP snd_nxt
+ * .cons cf. TCP sends ack
+ */
+ union smc_host_cursor tx_curs_prep; /* tx - prepared data
+ * snd_max..wmem_alloc
+ */
+ union smc_host_cursor tx_curs_sent; /* tx - sent data
+ * snd_nxt ?
+ */
+ union smc_host_cursor tx_curs_fin; /* tx - confirmed by peer
+ * snd-wnd-begin ?
+ */
+ atomic_t sndbuf_space; /* remaining space in sndbuf */
+ u16 tx_cdc_seq; /* sequence # for CDC send */
+ spinlock_t send_lock; /* protect wr_sends */
+ struct work_struct tx_work; /* retry of smc_cdc_msg_send */
+
+ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl.
+ * .prod cf. TCP rcv_nxt
+ * .cons cf. TCP snd_una
+ */
+ union smc_host_cursor rx_curs_confirmed; /* confirmed to peer
+ * source of snd_una ?
+ */
+ atomic_t bytes_to_rcv; /* arrived data,
+ * not yet received
+ */
+#ifndef KERNEL_HAS_ATOMIC64
+ spinlock_t acurs_lock; /* protect cursors */
+#endif
+};
+
+struct smc_sock { /* smc sock container */
+ struct sock sk;
+ struct socket *clcsock; /* internal tcp socket */
+ struct smc_connection conn; /* smc connection */
+ struct sockaddr *addr; /* inet connect address */
+ struct smc_sock *listen_smc; /* listen parent */
+ struct work_struct tcp_listen_work;/* handle tcp socket accepts */
+ struct work_struct smc_listen_work;/* prepare new accept socket */
+ struct list_head accept_q; /* sockets to be accepted */
+ spinlock_t accept_q_lock; /* protects accept_q */
+ struct delayed_work sock_put_work; /* final socket freeing */
+ bool use_fallback; /* fallback to tcp */
+ u8 wait_close_tx_prepared : 1;
+ /* shutdown wr or close
+ * started, waiting for unsent
+ * data to be sent
+ */
+};
+
+static inline struct smc_sock *smc_sk(const struct sock *sk)
+{
+ return (struct smc_sock *)sk;
+}
+
+#define SMC_SYSTEMID_LEN 8
+
+extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
+
+/* convert an u32 value into network byte order, store it into a 3 byte field */
+static inline void hton24(u8 *net, u32 host)
+{
+ __be32 t;
+
+ t = cpu_to_be32(host);
+ memcpy(net, ((u8 *)&t) + 1, 3);
+}
+
+/* convert a received 3 byte field into host byte order*/
+static inline u32 ntoh24(u8 *net)
+{
+ __be32 t = 0;
+
+ memcpy(((u8 *)&t) + 1, net, 3);
+ return be32_to_cpu(t);
+}
+
+#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */
+
+#define SMC_RMBE_SIZES 16 /* number of distinct sizes for an RMBE */
+/* theoretically, the RFC states that largest size would be 512K,
+ * i.e. compressed 5 and thus 6 sizes (0..5), despite
+ * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
+ */
+
+/* convert the RMB size into the compressed notation - minimum 16K.
+ * In contrast to plain ilog2, this rounds towards the next power of 2,
+ * so the socket application gets at least its desired sndbuf / rcvbuf size.
+ */
+static inline u8 smc_compress_bufsize(int size)
+{
+ u8 compressed;
+
+ if (size <= SMC_BUF_MIN_SIZE)
+ return 0;
+
+ size = (size - 1) >> 14;
+ compressed = ilog2(size) + 1;
+ if (compressed >= SMC_RMBE_SIZES)
+ compressed = SMC_RMBE_SIZES - 1;
+ return compressed;
+}
+
+/* convert the RMB size from compressed notation into integer */
+static inline int smc_uncompress_bufsize(u8 compressed)
+{
+ u32 size;
+
+ size = 0x00000001 << (((int)compressed) + 14);
+ return (int)size;
+}
+
+#ifdef CONFIG_XFRM
+static inline bool using_ipsec(struct smc_sock *smc)
+{
+ return (smc->clcsock->sk->sk_policy[0] ||
+ smc->clcsock->sk->sk_policy[1]) ? 1 : 0;
+}
+#else
+static inline bool using_ipsec(struct smc_sock *smc)
+{
+ return 0;
+}
+#endif
+
+struct smc_clc_msg_local;
+
+int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet,
+ u8 *prefix_len);
+void smc_conn_free(struct smc_connection *conn);
+int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
+ struct smc_ib_device *smcibdev, u8 ibport,
+ struct smc_clc_msg_local *lcl, int srv_first_contact);
+struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
+void smc_close_non_accepted(struct sock *sk);
+
+#endif /* __SMC_H */
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
new file mode 100644
index 000000000000..5a339493872e
--- /dev/null
+++ b/net/smc/smc_cdc.c
@@ -0,0 +1,304 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Connection Data Control (CDC)
+ * handles flow control
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/spinlock.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+#include "smc_cdc.h"
+#include "smc_tx.h"
+#include "smc_rx.h"
+#include "smc_close.h"
+
+/********************************** send *************************************/
+
+struct smc_cdc_tx_pend {
+ struct smc_connection *conn; /* socket connection */
+ union smc_host_cursor cursor; /* tx sndbuf cursor sent */
+ union smc_host_cursor p_cursor; /* rx RMBE cursor produced */
+ u16 ctrl_seq; /* conn. tx sequence # */
+};
+
+/* handler for send/transmission completion of a CDC msg */
+static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
+ struct smc_link *link,
+ enum ib_wc_status wc_status)
+{
+ struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd;
+ struct smc_sock *smc;
+ int diff;
+
+ if (!cdcpend->conn)
+ /* already dismissed */
+ return;
+
+ smc = container_of(cdcpend->conn, struct smc_sock, conn);
+ bh_lock_sock(&smc->sk);
+ if (!wc_status) {
+ diff = smc_curs_diff(cdcpend->conn->sndbuf_size,
+ &cdcpend->conn->tx_curs_fin,
+ &cdcpend->cursor);
+ /* sndbuf_space is decreased in smc_sendmsg */
+ smp_mb__before_atomic();
+ atomic_add(diff, &cdcpend->conn->sndbuf_space);
+ /* guarantee 0 <= sndbuf_space <= sndbuf_size */
+ smp_mb__after_atomic();
+ smc_curs_write(&cdcpend->conn->tx_curs_fin,
+ smc_curs_read(&cdcpend->cursor, cdcpend->conn),
+ cdcpend->conn);
+ }
+ smc_tx_sndbuf_nonfull(smc);
+ if (smc->sk.sk_state != SMC_ACTIVE)
+ /* wake up smc_close_wait_tx_pends() */
+ smc->sk.sk_state_change(&smc->sk);
+ bh_unlock_sock(&smc->sk);
+}
+
+int smc_cdc_get_free_slot(struct smc_link *link,
+ struct smc_wr_buf **wr_buf,
+ struct smc_cdc_tx_pend **pend)
+{
+ return smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf,
+ (struct smc_wr_tx_pend_priv **)pend);
+}
+
+static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
+ struct smc_cdc_tx_pend *pend)
+{
+ BUILD_BUG_ON_MSG(
+ sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE,
+ "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)");
+ BUILD_BUG_ON_MSG(
+ offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE,
+ "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
+ BUILD_BUG_ON_MSG(
+ sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
+ "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_cdc_tx_pend)");
+ pend->conn = conn;
+ pend->cursor = conn->tx_curs_sent;
+ pend->p_cursor = conn->local_tx_ctrl.prod;
+ pend->ctrl_seq = conn->tx_cdc_seq;
+}
+
+int smc_cdc_msg_send(struct smc_connection *conn,
+ struct smc_wr_buf *wr_buf,
+ struct smc_cdc_tx_pend *pend)
+{
+ struct smc_link *link;
+ int rc;
+
+ link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+
+ smc_cdc_add_pending_send(conn, pend);
+
+ conn->tx_cdc_seq++;
+ conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
+ smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf,
+ &conn->local_tx_ctrl, conn);
+ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
+ if (!rc)
+ smc_curs_write(&conn->rx_curs_confirmed,
+ smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+ conn);
+
+ return rc;
+}
+
+int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
+{
+ struct smc_cdc_tx_pend *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
+ &pend);
+ if (rc)
+ return rc;
+
+ return smc_cdc_msg_send(conn, wr_buf, pend);
+}
+
+static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend,
+ unsigned long data)
+{
+ struct smc_connection *conn = (struct smc_connection *)data;
+ struct smc_cdc_tx_pend *cdc_pend =
+ (struct smc_cdc_tx_pend *)tx_pend;
+
+ return cdc_pend->conn == conn;
+}
+
+static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend)
+{
+ struct smc_cdc_tx_pend *cdc_pend =
+ (struct smc_cdc_tx_pend *)tx_pend;
+
+ cdc_pend->conn = NULL;
+}
+
+void smc_cdc_tx_dismiss_slots(struct smc_connection *conn)
+{
+ struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+
+ smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE,
+ smc_cdc_tx_filter, smc_cdc_tx_dismisser,
+ (unsigned long)conn);
+}
+
+bool smc_cdc_tx_has_pending(struct smc_connection *conn)
+{
+ struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+
+ return smc_wr_tx_has_pending(link, SMC_CDC_MSG_TYPE,
+ smc_cdc_tx_filter, (unsigned long)conn);
+}
+
+/********************************* receive ***********************************/
+
+static inline bool smc_cdc_before(u16 seq1, u16 seq2)
+{
+ return (s16)(seq1 - seq2) < 0;
+}
+
+static void smc_cdc_msg_recv_action(struct smc_sock *smc,
+ struct smc_link *link,
+ struct smc_cdc_msg *cdc)
+{
+ union smc_host_cursor cons_old, prod_old;
+ struct smc_connection *conn = &smc->conn;
+ int diff_cons, diff_prod;
+
+ if (!cdc->prod_flags.failover_validation) {
+ if (smc_cdc_before(ntohs(cdc->seqno),
+ conn->local_rx_ctrl.seqno))
+ /* received seqno is old */
+ return;
+ }
+ smc_curs_write(&prod_old,
+ smc_curs_read(&conn->local_rx_ctrl.prod, conn),
+ conn);
+ smc_curs_write(&cons_old,
+ smc_curs_read(&conn->local_rx_ctrl.cons, conn),
+ conn);
+ smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn);
+
+ diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old,
+ &conn->local_rx_ctrl.cons);
+ if (diff_cons) {
+ /* peer_rmbe_space is decreased during data transfer with RDMA
+ * write
+ */
+ smp_mb__before_atomic();
+ atomic_add(diff_cons, &conn->peer_rmbe_space);
+ /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
+ smp_mb__after_atomic();
+ }
+
+ diff_prod = smc_curs_diff(conn->rmbe_size, &prod_old,
+ &conn->local_rx_ctrl.prod);
+ if (diff_prod) {
+ /* bytes_to_rcv is decreased in smc_recvmsg */
+ smp_mb__before_atomic();
+ atomic_add(diff_prod, &conn->bytes_to_rcv);
+ /* guarantee 0 <= bytes_to_rcv <= rmbe_size */
+ smp_mb__after_atomic();
+ smc->sk.sk_data_ready(&smc->sk);
+ }
+
+ if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
+ smc->sk.sk_err = ECONNRESET;
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ }
+ if (smc_cdc_rxed_any_close_or_senddone(conn))
+ smc_close_passive_received(smc);
+
+ /* piggy backed tx info */
+ /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
+ if (diff_cons && smc_tx_prepared_sends(conn)) {
+ smc_tx_sndbuf_nonempty(conn);
+ /* trigger socket release if connection closed */
+ smc_close_wake_tx_prepared(smc);
+ }
+
+ /* subsequent patch: trigger socket release if connection closed */
+
+ /* socket connected but not accepted */
+ if (!smc->sk.sk_socket)
+ return;
+
+ /* data available */
+ if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
+ (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req))
+ smc_tx_consumer_update(conn);
+}
+
+/* called under tasklet context */
+static inline void smc_cdc_msg_recv(struct smc_cdc_msg *cdc,
+ struct smc_link *link, u64 wr_id)
+{
+ struct smc_link_group *lgr = container_of(link, struct smc_link_group,
+ lnk[SMC_SINGLE_LINK]);
+ struct smc_connection *connection;
+ struct smc_sock *smc;
+
+ /* lookup connection */
+ read_lock_bh(&lgr->conns_lock);
+ connection = smc_lgr_find_conn(ntohl(cdc->token), lgr);
+ if (!connection) {
+ read_unlock_bh(&lgr->conns_lock);
+ return;
+ }
+ smc = container_of(connection, struct smc_sock, conn);
+ sock_hold(&smc->sk);
+ read_unlock_bh(&lgr->conns_lock);
+ bh_lock_sock(&smc->sk);
+ smc_cdc_msg_recv_action(smc, link, cdc);
+ bh_unlock_sock(&smc->sk);
+ sock_put(&smc->sk); /* no free sk in softirq-context */
+}
+
+/***************************** init, exit, misc ******************************/
+
+static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
+{
+ struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+ struct smc_cdc_msg *cdc = buf;
+
+ if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved))
+ return; /* short message */
+ if (cdc->len != sizeof(*cdc))
+ return; /* invalid message */
+ smc_cdc_msg_recv(cdc, link, wc->wr_id);
+}
+
+static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = {
+ {
+ .handler = smc_cdc_rx_handler,
+ .type = SMC_CDC_MSG_TYPE
+ },
+ {
+ .handler = NULL,
+ }
+};
+
+int __init smc_cdc_init(void)
+{
+ struct smc_wr_rx_handler *handler;
+ int rc = 0;
+
+ for (handler = smc_cdc_rx_handlers; handler->handler; handler++) {
+ INIT_HLIST_NODE(&handler->list);
+ rc = smc_wr_rx_register_handler(handler);
+ if (rc)
+ break;
+ }
+ return rc;
+}
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
new file mode 100644
index 000000000000..8e1d76f26007
--- /dev/null
+++ b/net/smc/smc_cdc.h
@@ -0,0 +1,218 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Connection Data Control (CDC)
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_CDC_H
+#define SMC_CDC_H
+
+#include <linux/kernel.h> /* max_t */
+#include <linux/atomic.h>
+#include <linux/in.h>
+#include <linux/compiler.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_wr.h"
+
+#define SMC_CDC_MSG_TYPE 0xFE
+
+/* in network byte order */
+union smc_cdc_cursor { /* SMC cursor */
+ struct {
+ __be16 reserved;
+ __be16 wrap;
+ __be32 count;
+ };
+#ifdef KERNEL_HAS_ATOMIC64
+ atomic64_t acurs; /* for atomic processing */
+#else
+ u64 acurs; /* for atomic processing */
+#endif
+} __aligned(8);
+
+/* in network byte order */
+struct smc_cdc_msg {
+ struct smc_wr_rx_hdr common; /* .type = 0xFE */
+ u8 len; /* 44 */
+ __be16 seqno;
+ __be32 token;
+ union smc_cdc_cursor prod;
+ union smc_cdc_cursor cons; /* piggy backed "ack" */
+ struct smc_cdc_producer_flags prod_flags;
+ struct smc_cdc_conn_state_flags conn_state_flags;
+ u8 reserved[18];
+} __aligned(8);
+
+static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
+{
+ return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort ||
+ conn->local_rx_ctrl.conn_state_flags.peer_conn_closed;
+}
+
+static inline bool smc_cdc_rxed_any_close_or_senddone(
+ struct smc_connection *conn)
+{
+ return smc_cdc_rxed_any_close(conn) ||
+ conn->local_rx_ctrl.conn_state_flags.peer_done_writing;
+}
+
+static inline void smc_curs_add(int size, union smc_host_cursor *curs,
+ int value)
+{
+ curs->count += value;
+ if (curs->count >= size) {
+ curs->wrap++;
+ curs->count -= size;
+ }
+}
+
+/* SMC cursors are 8 bytes long and require atomic reading and writing */
+static inline u64 smc_curs_read(union smc_host_cursor *curs,
+ struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+ unsigned long flags;
+ u64 ret;
+
+ spin_lock_irqsave(&conn->acurs_lock, flags);
+ ret = curs->acurs;
+ spin_unlock_irqrestore(&conn->acurs_lock, flags);
+ return ret;
+#else
+ return atomic64_read(&curs->acurs);
+#endif
+}
+
+static inline u64 smc_curs_read_net(union smc_cdc_cursor *curs,
+ struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+ unsigned long flags;
+ u64 ret;
+
+ spin_lock_irqsave(&conn->acurs_lock, flags);
+ ret = curs->acurs;
+ spin_unlock_irqrestore(&conn->acurs_lock, flags);
+ return ret;
+#else
+ return atomic64_read(&curs->acurs);
+#endif
+}
+
+static inline void smc_curs_write(union smc_host_cursor *curs, u64 val,
+ struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->acurs_lock, flags);
+ curs->acurs = val;
+ spin_unlock_irqrestore(&conn->acurs_lock, flags);
+#else
+ atomic64_set(&curs->acurs, val);
+#endif
+}
+
+static inline void smc_curs_write_net(union smc_cdc_cursor *curs, u64 val,
+ struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->acurs_lock, flags);
+ curs->acurs = val;
+ spin_unlock_irqrestore(&conn->acurs_lock, flags);
+#else
+ atomic64_set(&curs->acurs, val);
+#endif
+}
+
+/* calculate cursor difference between old and new, where old <= new */
+static inline int smc_curs_diff(unsigned int size,
+ union smc_host_cursor *old,
+ union smc_host_cursor *new)
+{
+ if (old->wrap != new->wrap)
+ return max_t(int, 0,
+ ((size - old->count) + new->count));
+
+ return max_t(int, 0, (new->count - old->count));
+}
+
+static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
+ union smc_host_cursor *local,
+ struct smc_connection *conn)
+{
+ union smc_host_cursor temp;
+
+ smc_curs_write(&temp, smc_curs_read(local, conn), conn);
+ peer->count = htonl(temp.count);
+ peer->wrap = htons(temp.wrap);
+ /* peer->reserved = htons(0); must be ensured by caller */
+}
+
+static inline void smc_host_msg_to_cdc(struct smc_cdc_msg *peer,
+ struct smc_host_cdc_msg *local,
+ struct smc_connection *conn)
+{
+ peer->common.type = local->common.type;
+ peer->len = local->len;
+ peer->seqno = htons(local->seqno);
+ peer->token = htonl(local->token);
+ smc_host_cursor_to_cdc(&peer->prod, &local->prod, conn);
+ smc_host_cursor_to_cdc(&peer->cons, &local->cons, conn);
+ peer->prod_flags = local->prod_flags;
+ peer->conn_state_flags = local->conn_state_flags;
+}
+
+static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local,
+ union smc_cdc_cursor *peer,
+ struct smc_connection *conn)
+{
+ union smc_host_cursor temp, old;
+ union smc_cdc_cursor net;
+
+ smc_curs_write(&old, smc_curs_read(local, conn), conn);
+ smc_curs_write_net(&net, smc_curs_read_net(peer, conn), conn);
+ temp.count = ntohl(net.count);
+ temp.wrap = ntohs(net.wrap);
+ if ((old.wrap > temp.wrap) && temp.wrap)
+ return;
+ if ((old.wrap == temp.wrap) &&
+ (old.count > temp.count))
+ return;
+ smc_curs_write(local, smc_curs_read(&temp, conn), conn);
+}
+
+static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
+ struct smc_cdc_msg *peer,
+ struct smc_connection *conn)
+{
+ local->common.type = peer->common.type;
+ local->len = peer->len;
+ local->seqno = ntohs(peer->seqno);
+ local->token = ntohl(peer->token);
+ smc_cdc_cursor_to_host(&local->prod, &peer->prod, conn);
+ smc_cdc_cursor_to_host(&local->cons, &peer->cons, conn);
+ local->prod_flags = peer->prod_flags;
+ local->conn_state_flags = peer->conn_state_flags;
+}
+
+struct smc_cdc_tx_pend;
+
+int smc_cdc_get_free_slot(struct smc_link *link, struct smc_wr_buf **wr_buf,
+ struct smc_cdc_tx_pend **pend);
+void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
+int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
+ struct smc_cdc_tx_pend *pend);
+int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);
+bool smc_cdc_tx_has_pending(struct smc_connection *conn);
+int smc_cdc_init(void) __init;
+
+#endif /* SMC_CDC_H */
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
new file mode 100644
index 000000000000..e41f594a1e1d
--- /dev/null
+++ b/net/smc/smc_clc.c
@@ -0,0 +1,282 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * CLC (connection layer control) handshake over initial TCP socket to
+ * prepare for RDMA traffic
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/sched/signal.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_clc.h"
+#include "smc_ib.h"
+
+/* Wait for data on the tcp-socket, analyze received data
+ * Returns:
+ * 0 if success and it was not a decline that we received.
+ * SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send.
+ * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise.
+ */
+int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
+ u8 expected_type)
+{
+ struct sock *clc_sk = smc->clcsock->sk;
+ struct smc_clc_msg_hdr *clcm = buf;
+ struct msghdr msg = {NULL, 0};
+ int reason_code = 0;
+ struct kvec vec;
+ int len, datlen;
+ int krflags;
+
+ /* peek the first few bytes to determine length of data to receive
+ * so we don't consume any subsequent CLC message or payload data
+ * in the TCP byte stream
+ */
+ vec.iov_base = buf;
+ vec.iov_len = buflen;
+ krflags = MSG_PEEK | MSG_WAITALL;
+ smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
+ len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1,
+ sizeof(struct smc_clc_msg_hdr), krflags);
+ if (signal_pending(current)) {
+ reason_code = -EINTR;
+ clc_sk->sk_err = EINTR;
+ smc->sk.sk_err = EINTR;
+ goto out;
+ }
+ if (clc_sk->sk_err) {
+ reason_code = -clc_sk->sk_err;
+ smc->sk.sk_err = clc_sk->sk_err;
+ goto out;
+ }
+ if (!len) { /* peer has performed orderly shutdown */
+ smc->sk.sk_err = ECONNRESET;
+ reason_code = -ECONNRESET;
+ goto out;
+ }
+ if (len < 0) {
+ smc->sk.sk_err = -len;
+ reason_code = len;
+ goto out;
+ }
+ datlen = ntohs(clcm->length);
+ if ((len < sizeof(struct smc_clc_msg_hdr)) ||
+ (datlen < sizeof(struct smc_clc_msg_decline)) ||
+ (datlen > sizeof(struct smc_clc_msg_accept_confirm)) ||
+ memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) ||
+ ((clcm->type != SMC_CLC_DECLINE) &&
+ (clcm->type != expected_type))) {
+ smc->sk.sk_err = EPROTO;
+ reason_code = -EPROTO;
+ goto out;
+ }
+
+ /* receive the complete CLC message */
+ vec.iov_base = buf;
+ vec.iov_len = buflen;
+ memset(&msg, 0, sizeof(struct msghdr));
+ krflags = MSG_WAITALL;
+ smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
+ len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags);
+ if (len < datlen) {
+ smc->sk.sk_err = EPROTO;
+ reason_code = -EPROTO;
+ goto out;
+ }
+ if (clcm->type == SMC_CLC_DECLINE) {
+ reason_code = SMC_CLC_DECL_REPLY;
+ if (ntohl(((struct smc_clc_msg_decline *)buf)->peer_diagnosis)
+ == SMC_CLC_DECL_SYNCERR)
+ smc->conn.lgr->sync_err = true;
+ }
+
+out:
+ return reason_code;
+}
+
+/* send CLC DECLINE message across internal TCP socket */
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
+ u8 out_of_sync)
+{
+ struct smc_clc_msg_decline dclc;
+ struct msghdr msg;
+ struct kvec vec;
+ int len;
+
+ memset(&dclc, 0, sizeof(dclc));
+ memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ dclc.hdr.type = SMC_CLC_DECLINE;
+ dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
+ dclc.hdr.version = SMC_CLC_V1;
+ dclc.hdr.flag = out_of_sync ? 1 : 0;
+ memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid));
+ dclc.peer_diagnosis = htonl(peer_diag_info);
+ memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+ memset(&msg, 0, sizeof(msg));
+ vec.iov_base = &dclc;
+ vec.iov_len = sizeof(struct smc_clc_msg_decline);
+ len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
+ sizeof(struct smc_clc_msg_decline));
+ if (len < sizeof(struct smc_clc_msg_decline))
+ smc->sk.sk_err = EPROTO;
+ if (len < 0)
+ smc->sk.sk_err = -len;
+ return len;
+}
+
+/* send CLC PROPOSAL message across internal TCP socket */
+int smc_clc_send_proposal(struct smc_sock *smc,
+ struct smc_ib_device *smcibdev,
+ u8 ibport)
+{
+ struct smc_clc_msg_proposal pclc;
+ int reason_code = 0;
+ struct msghdr msg;
+ struct kvec vec;
+ int len, rc;
+
+ /* send SMC Proposal CLC message */
+ memset(&pclc, 0, sizeof(pclc));
+ memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ pclc.hdr.type = SMC_CLC_PROPOSAL;
+ pclc.hdr.length = htons(sizeof(pclc));
+ pclc.hdr.version = SMC_CLC_V1; /* SMC version */
+ memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
+ memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE);
+ memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN);
+
+ /* determine subnet and mask from internal TCP socket */
+ rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet,
+ &pclc.prefix_len);
+ if (rc)
+ return SMC_CLC_DECL_CNFERR; /* configuration error */
+ memcpy(pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ memset(&msg, 0, sizeof(msg));
+ vec.iov_base = &pclc;
+ vec.iov_len = sizeof(pclc);
+ /* due to the few bytes needed for clc-handshake this cannot block */
+ len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc));
+ if (len < sizeof(pclc)) {
+ if (len >= 0) {
+ reason_code = -ENETUNREACH;
+ smc->sk.sk_err = -reason_code;
+ } else {
+ smc->sk.sk_err = smc->clcsock->sk->sk_err;
+ reason_code = -smc->sk.sk_err;
+ }
+ }
+
+ return reason_code;
+}
+
+/* send CLC CONFIRM message across internal TCP socket */
+int smc_clc_send_confirm(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct smc_clc_msg_accept_confirm cclc;
+ struct smc_link *link;
+ int reason_code = 0;
+ struct msghdr msg;
+ struct kvec vec;
+ int len;
+
+ link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+ /* send SMC Confirm CLC msg */
+ memset(&cclc, 0, sizeof(cclc));
+ memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ cclc.hdr.type = SMC_CLC_CONFIRM;
+ cclc.hdr.length = htons(sizeof(cclc));
+ cclc.hdr.version = SMC_CLC_V1; /* SMC version */
+ memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
+ memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
+ SMC_GID_SIZE);
+ memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
+ hton24(cclc.qpn, link->roce_qp->qp_num);
+ cclc.rmb_rkey =
+ htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
+ cclc.rmbe_alert_token = htonl(conn->alert_token_local);
+ cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
+ cclc.rmbe_size = conn->rmbe_size_short;
+ cclc.rmb_dma_addr =
+ cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
+ hton24(cclc.psn, link->psn_initial);
+
+ memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+ memset(&msg, 0, sizeof(msg));
+ vec.iov_base = &cclc;
+ vec.iov_len = sizeof(cclc);
+ len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc));
+ if (len < sizeof(cclc)) {
+ if (len >= 0) {
+ reason_code = -ENETUNREACH;
+ smc->sk.sk_err = -reason_code;
+ } else {
+ smc->sk.sk_err = smc->clcsock->sk->sk_err;
+ reason_code = -smc->sk.sk_err;
+ }
+ }
+ return reason_code;
+}
+
+/* send CLC ACCEPT message across internal TCP socket */
+int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
+{
+ struct smc_connection *conn = &new_smc->conn;
+ struct smc_clc_msg_accept_confirm aclc;
+ struct smc_link *link;
+ struct msghdr msg;
+ struct kvec vec;
+ int rc = 0;
+ int len;
+
+ link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+ memset(&aclc, 0, sizeof(aclc));
+ memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ aclc.hdr.type = SMC_CLC_ACCEPT;
+ aclc.hdr.length = htons(sizeof(aclc));
+ aclc.hdr.version = SMC_CLC_V1; /* SMC version */
+ if (srv_first_contact)
+ aclc.hdr.flag = 1;
+ memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
+ memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
+ SMC_GID_SIZE);
+ memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
+ hton24(aclc.qpn, link->roce_qp->qp_num);
+ aclc.rmb_rkey =
+ htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */
+ aclc.rmbe_alert_token = htonl(conn->alert_token_local);
+ aclc.qp_mtu = link->path_mtu;
+ aclc.rmbe_size = conn->rmbe_size_short,
+ aclc.rmb_dma_addr =
+ cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
+ hton24(aclc.psn, link->psn_initial);
+ memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+ memset(&msg, 0, sizeof(msg));
+ vec.iov_base = &aclc;
+ vec.iov_len = sizeof(aclc);
+ len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc));
+ if (len < sizeof(aclc)) {
+ if (len >= 0)
+ new_smc->sk.sk_err = EPROTO;
+ else
+ new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err;
+ rc = sock_error(&new_smc->sk);
+ }
+
+ return rc;
+}
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
new file mode 100644
index 000000000000..13db8ce177c9
--- /dev/null
+++ b/net/smc/smc_clc.h
@@ -0,0 +1,116 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * CLC (connection layer control) handshake over initial TCP socket to
+ * prepare for RDMA traffic
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_CLC_H
+#define _SMC_CLC_H
+
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+
+#define SMC_CLC_PROPOSAL 0x01
+#define SMC_CLC_ACCEPT 0x02
+#define SMC_CLC_CONFIRM 0x03
+#define SMC_CLC_DECLINE 0x04
+
+/* eye catcher "SMCR" EBCDIC for CLC messages */
+static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
+
+#define SMC_CLC_V1 0x1 /* SMC version */
+#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */
+#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */
+#define SMC_CLC_DECL_TIMEOUT 0x02000000 /* timeout */
+#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */
+#define SMC_CLC_DECL_IPSEC 0x03030000 /* IPsec usage */
+#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */
+#define SMC_CLC_DECL_REPLY 0x06000000 /* reply to a received decline */
+#define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */
+#define SMC_CLC_DECL_TCL 0x02040000 /* timeout w4 QP confirm */
+#define SMC_CLC_DECL_SEND 0x07000000 /* sending problem */
+
+struct smc_clc_msg_hdr { /* header1 of clc messages */
+ u8 eyecatcher[4]; /* eye catcher */
+ u8 type; /* proposal / accept / confirm / decline */
+ __be16 length;
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 version : 4,
+ flag : 1,
+ rsvd : 3;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 rsvd : 3,
+ flag : 1,
+ version : 4;
+#endif
+} __packed; /* format defined in RFC7609 */
+
+struct smc_clc_msg_trail { /* trailer of clc messages */
+ u8 eyecatcher[4];
+};
+
+struct smc_clc_msg_local { /* header2 of clc messages */
+ u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */
+ u8 gid[16]; /* gid of ib_device port */
+ u8 mac[6]; /* mac of ib_device port */
+};
+
+struct smc_clc_msg_proposal { /* clc proposal message */
+ struct smc_clc_msg_hdr hdr;
+ struct smc_clc_msg_local lcl;
+ __be16 iparea_offset; /* offset to IP address information area */
+ __be32 outgoing_subnet; /* subnet mask */
+ u8 prefix_len; /* number of significant bits in mask */
+ u8 reserved[2];
+ u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */
+ struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+} __aligned(4);
+
+struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
+ struct smc_clc_msg_hdr hdr;
+ struct smc_clc_msg_local lcl;
+ u8 qpn[3]; /* QP number */
+ __be32 rmb_rkey; /* RMB rkey */
+ u8 conn_idx; /* Connection index, which RMBE in RMB */
+ __be32 rmbe_alert_token;/* unique connection id */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */
+ qp_mtu : 4; /* QP mtu */
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 qp_mtu : 4,
+ rmbe_size : 4;
+#endif
+ u8 reserved;
+ __be64 rmb_dma_addr; /* RMB virtual address */
+ u8 reserved2;
+ u8 psn[3]; /* initial packet sequence number */
+ struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+} __packed; /* format defined in RFC7609 */
+
+struct smc_clc_msg_decline { /* clc decline message */
+ struct smc_clc_msg_hdr hdr;
+ u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
+ __be32 peer_diagnosis; /* diagnosis information */
+ u8 reserved2[4];
+ struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+} __aligned(4);
+
+struct smc_sock;
+struct smc_ib_device;
+
+int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
+ u8 expected_type);
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
+ u8 out_of_sync);
+int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev,
+ u8 ibport);
+int smc_clc_send_confirm(struct smc_sock *smc);
+int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact);
+
+#endif
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
new file mode 100644
index 000000000000..67a71d170bed
--- /dev/null
+++ b/net/smc/smc_close.c
@@ -0,0 +1,444 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Socket Closing - normal and abnormal
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/workqueue.h>
+#include <linux/sched/signal.h>
+
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_tx.h"
+#include "smc_cdc.h"
+#include "smc_close.h"
+
+#define SMC_CLOSE_WAIT_TX_PENDS_TIME (5 * HZ)
+
+static void smc_close_cleanup_listen(struct sock *parent)
+{
+ struct sock *sk;
+
+ /* Close non-accepted connections */
+ while ((sk = smc_accept_dequeue(parent, NULL)))
+ smc_close_non_accepted(sk);
+}
+
+static void smc_close_wait_tx_pends(struct smc_sock *smc)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct sock *sk = &smc->sk;
+ signed long timeout;
+
+ timeout = SMC_CLOSE_WAIT_TX_PENDS_TIME;
+ add_wait_queue(sk_sleep(sk), &wait);
+ while (!signal_pending(current) && timeout) {
+ int rc;
+
+ rc = sk_wait_event(sk, &timeout,
+ !smc_cdc_tx_has_pending(&smc->conn),
+ &wait);
+ if (rc)
+ break;
+ }
+ remove_wait_queue(sk_sleep(sk), &wait);
+}
+
+/* wait for sndbuf data being transmitted */
+static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct sock *sk = &smc->sk;
+
+ if (!timeout)
+ return;
+
+ if (!smc_tx_prepared_sends(&smc->conn))
+ return;
+
+ smc->wait_close_tx_prepared = 1;
+ add_wait_queue(sk_sleep(sk), &wait);
+ while (!signal_pending(current) && timeout) {
+ int rc;
+
+ rc = sk_wait_event(sk, &timeout,
+ !smc_tx_prepared_sends(&smc->conn) ||
+ (sk->sk_err == ECONNABORTED) ||
+ (sk->sk_err == ECONNRESET),
+ &wait);
+ if (rc)
+ break;
+ }
+ remove_wait_queue(sk_sleep(sk), &wait);
+ smc->wait_close_tx_prepared = 0;
+}
+
+void smc_close_wake_tx_prepared(struct smc_sock *smc)
+{
+ if (smc->wait_close_tx_prepared)
+ /* wake up socket closing */
+ smc->sk.sk_state_change(&smc->sk);
+}
+
+static int smc_close_wr(struct smc_connection *conn)
+{
+ conn->local_tx_ctrl.conn_state_flags.peer_done_writing = 1;
+
+ return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+static int smc_close_final(struct smc_connection *conn)
+{
+ if (atomic_read(&conn->bytes_to_rcv))
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ else
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1;
+
+ return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+static int smc_close_abort(struct smc_connection *conn)
+{
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+
+ return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+/* terminate smc socket abnormally - active abort
+ * RDMA communication no longer possible
+ */
+void smc_close_active_abort(struct smc_sock *smc)
+{
+ struct smc_cdc_conn_state_flags *txflags =
+ &smc->conn.local_tx_ctrl.conn_state_flags;
+
+ bh_lock_sock(&smc->sk);
+ smc->sk.sk_err = ECONNABORTED;
+ if (smc->clcsock && smc->clcsock->sk) {
+ smc->clcsock->sk->sk_err = ECONNABORTED;
+ smc->clcsock->sk->sk_state_change(smc->clcsock->sk);
+ }
+ switch (smc->sk.sk_state) {
+ case SMC_INIT:
+ smc->sk.sk_state = SMC_PEERABORTWAIT;
+ break;
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ txflags->peer_conn_abort = 1;
+ sock_release(smc->clcsock);
+ if (!smc_cdc_rxed_any_close(&smc->conn))
+ smc->sk.sk_state = SMC_PEERABORTWAIT;
+ else
+ smc->sk.sk_state = SMC_CLOSED;
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ if (!txflags->peer_conn_closed) {
+ smc->sk.sk_state = SMC_PEERABORTWAIT;
+ txflags->peer_conn_abort = 1;
+ sock_release(smc->clcsock);
+ } else {
+ smc->sk.sk_state = SMC_CLOSED;
+ }
+ break;
+ case SMC_PROCESSABORT:
+ case SMC_APPFINCLOSEWAIT:
+ if (!txflags->peer_conn_closed) {
+ txflags->peer_conn_abort = 1;
+ sock_release(smc->clcsock);
+ }
+ smc->sk.sk_state = SMC_CLOSED;
+ break;
+ case SMC_PEERFINCLOSEWAIT:
+ case SMC_PEERABORTWAIT:
+ case SMC_CLOSED:
+ break;
+ }
+
+ sock_set_flag(&smc->sk, SOCK_DEAD);
+ bh_unlock_sock(&smc->sk);
+ smc->sk.sk_state_change(&smc->sk);
+}
+
+int smc_close_active(struct smc_sock *smc)
+{
+ struct smc_cdc_conn_state_flags *txflags =
+ &smc->conn.local_tx_ctrl.conn_state_flags;
+ long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ int old_state;
+ int rc = 0;
+
+ if (sock_flag(sk, SOCK_LINGER) &&
+ !(current->flags & PF_EXITING))
+ timeout = sk->sk_lingertime;
+
+again:
+ old_state = sk->sk_state;
+ switch (old_state) {
+ case SMC_INIT:
+ sk->sk_state = SMC_CLOSED;
+ if (smc->smc_listen_work.func)
+ flush_work(&smc->smc_listen_work);
+ sock_put(sk);
+ break;
+ case SMC_LISTEN:
+ sk->sk_state = SMC_CLOSED;
+ sk->sk_state_change(sk); /* wake up accept */
+ if (smc->clcsock && smc->clcsock->sk) {
+ rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
+ /* wake up kernel_accept of smc_tcp_listen_worker */
+ smc->clcsock->sk->sk_data_ready(smc->clcsock->sk);
+ }
+ release_sock(sk);
+ smc_close_cleanup_listen(sk);
+ flush_work(&smc->tcp_listen_work);
+ lock_sock(sk);
+ break;
+ case SMC_ACTIVE:
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk);
+ cancel_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ if (sk->sk_state == SMC_ACTIVE) {
+ /* send close request */
+ rc = smc_close_final(conn);
+ sk->sk_state = SMC_PEERCLOSEWAIT1;
+ } else {
+ /* peer event has changed the state */
+ goto again;
+ }
+ break;
+ case SMC_APPFINCLOSEWAIT:
+ /* socket already shutdown wr or both (active close) */
+ if (txflags->peer_done_writing &&
+ !txflags->peer_conn_closed) {
+ /* just shutdown wr done, send close request */
+ rc = smc_close_final(conn);
+ }
+ sk->sk_state = SMC_CLOSED;
+ smc_close_wait_tx_pends(smc);
+ break;
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ if (!smc_cdc_rxed_any_close(conn))
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk);
+ cancel_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ if (sk->sk_err != ECONNABORTED) {
+ /* confirm close from peer */
+ rc = smc_close_final(conn);
+ if (rc)
+ break;
+ }
+ if (smc_cdc_rxed_any_close(conn))
+ /* peer has closed the socket already */
+ sk->sk_state = SMC_CLOSED;
+ else
+ /* peer has just issued a shutdown write */
+ sk->sk_state = SMC_PEERFINCLOSEWAIT;
+ smc_close_wait_tx_pends(smc);
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ case SMC_PEERFINCLOSEWAIT:
+ /* peer sending PeerConnectionClosed will cause transition */
+ break;
+ case SMC_PROCESSABORT:
+ cancel_work_sync(&conn->tx_work);
+ smc_close_abort(conn);
+ sk->sk_state = SMC_CLOSED;
+ smc_close_wait_tx_pends(smc);
+ break;
+ case SMC_PEERABORTWAIT:
+ case SMC_CLOSED:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+
+ if (old_state != sk->sk_state)
+ sk->sk_state_change(&smc->sk);
+ return rc;
+}
+
+static void smc_close_passive_abort_received(struct smc_sock *smc)
+{
+ struct smc_cdc_conn_state_flags *txflags =
+ &smc->conn.local_tx_ctrl.conn_state_flags;
+ struct sock *sk = &smc->sk;
+
+ switch (sk->sk_state) {
+ case SMC_ACTIVE:
+ case SMC_APPFINCLOSEWAIT:
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ smc_close_abort(&smc->conn);
+ sk->sk_state = SMC_PROCESSABORT;
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ if (txflags->peer_done_writing &&
+ !txflags->peer_conn_closed) {
+ /* just shutdown, but not yet closed locally */
+ smc_close_abort(&smc->conn);
+ sk->sk_state = SMC_PROCESSABORT;
+ } else {
+ sk->sk_state = SMC_CLOSED;
+ }
+ break;
+ case SMC_PEERFINCLOSEWAIT:
+ case SMC_PEERABORTWAIT:
+ sk->sk_state = SMC_CLOSED;
+ break;
+ case SMC_INIT:
+ case SMC_PROCESSABORT:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+}
+
+/* Some kind of closing has been received: peer_conn_closed, peer_conn_abort,
+ * or peer_done_writing.
+ * Called under tasklet context.
+ */
+void smc_close_passive_received(struct smc_sock *smc)
+{
+ struct smc_cdc_conn_state_flags *rxflags =
+ &smc->conn.local_rx_ctrl.conn_state_flags;
+ struct sock *sk = &smc->sk;
+ int old_state;
+
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+ if (smc->clcsock && smc->clcsock->sk)
+ smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN;
+ sock_set_flag(&smc->sk, SOCK_DONE);
+
+ old_state = sk->sk_state;
+
+ if (rxflags->peer_conn_abort) {
+ smc_close_passive_abort_received(smc);
+ goto wakeup;
+ }
+
+ switch (sk->sk_state) {
+ case SMC_INIT:
+ if (atomic_read(&smc->conn.bytes_to_rcv) ||
+ (rxflags->peer_done_writing &&
+ !rxflags->peer_conn_closed))
+ sk->sk_state = SMC_APPCLOSEWAIT1;
+ else
+ sk->sk_state = SMC_CLOSED;
+ break;
+ case SMC_ACTIVE:
+ sk->sk_state = SMC_APPCLOSEWAIT1;
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ if (rxflags->peer_done_writing)
+ sk->sk_state = SMC_PEERCLOSEWAIT2;
+ /* fall through to check for closing */
+ case SMC_PEERCLOSEWAIT2:
+ case SMC_PEERFINCLOSEWAIT:
+ if (!smc_cdc_rxed_any_close(&smc->conn))
+ break;
+ if (sock_flag(sk, SOCK_DEAD) &&
+ (sk->sk_shutdown == SHUTDOWN_MASK)) {
+ /* smc_release has already been called locally */
+ sk->sk_state = SMC_CLOSED;
+ } else {
+ /* just shutdown, but not yet closed locally */
+ sk->sk_state = SMC_APPFINCLOSEWAIT;
+ }
+ break;
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ case SMC_APPFINCLOSEWAIT:
+ case SMC_PEERABORTWAIT:
+ case SMC_PROCESSABORT:
+ case SMC_CLOSED:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+
+wakeup:
+ if (old_state != sk->sk_state)
+ sk->sk_state_change(sk);
+ sk->sk_data_ready(sk); /* wakeup blocked rcvbuf consumers */
+ sk->sk_write_space(sk); /* wakeup blocked sndbuf producers */
+
+ if ((sk->sk_state == SMC_CLOSED) &&
+ (sock_flag(sk, SOCK_DEAD) || (old_state == SMC_INIT))) {
+ smc_conn_free(&smc->conn);
+ schedule_delayed_work(&smc->sock_put_work,
+ SMC_CLOSE_SOCK_PUT_DELAY);
+ }
+}
+
+void smc_close_sock_put_work(struct work_struct *work)
+{
+ struct smc_sock *smc = container_of(to_delayed_work(work),
+ struct smc_sock,
+ sock_put_work);
+
+ smc->sk.sk_prot->unhash(&smc->sk);
+ sock_put(&smc->sk);
+}
+
+int smc_close_shutdown_write(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
+ struct sock *sk = &smc->sk;
+ int old_state;
+ int rc = 0;
+
+ if (sock_flag(sk, SOCK_LINGER))
+ timeout = sk->sk_lingertime;
+
+again:
+ old_state = sk->sk_state;
+ switch (old_state) {
+ case SMC_ACTIVE:
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk);
+ cancel_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ /* send close wr request */
+ rc = smc_close_wr(conn);
+ if (sk->sk_state == SMC_ACTIVE)
+ sk->sk_state = SMC_PEERCLOSEWAIT1;
+ else
+ goto again;
+ break;
+ case SMC_APPCLOSEWAIT1:
+ /* passive close */
+ if (!smc_cdc_rxed_any_close(conn))
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk);
+ cancel_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ /* confirm close from peer */
+ rc = smc_close_wr(conn);
+ sk->sk_state = SMC_APPCLOSEWAIT2;
+ break;
+ case SMC_APPCLOSEWAIT2:
+ case SMC_PEERFINCLOSEWAIT:
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ case SMC_APPFINCLOSEWAIT:
+ case SMC_PROCESSABORT:
+ case SMC_PEERABORTWAIT:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+
+ if (old_state != sk->sk_state)
+ sk->sk_state_change(&smc->sk);
+ return rc;
+}
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
new file mode 100644
index 000000000000..bc9a2df3633c
--- /dev/null
+++ b/net/smc/smc_close.h
@@ -0,0 +1,28 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Socket Closing
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_CLOSE_H
+#define SMC_CLOSE_H
+
+#include <linux/workqueue.h>
+
+#include "smc.h"
+
+#define SMC_MAX_STREAM_WAIT_TIMEOUT (2 * HZ)
+#define SMC_CLOSE_SOCK_PUT_DELAY HZ
+
+void smc_close_wake_tx_prepared(struct smc_sock *smc);
+void smc_close_active_abort(struct smc_sock *smc);
+int smc_close_active(struct smc_sock *smc);
+void smc_close_passive_received(struct smc_sock *smc);
+void smc_close_sock_put_work(struct work_struct *work);
+int smc_close_shutdown_write(struct smc_sock *smc);
+
+#endif /* SMC_CLOSE_H */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
new file mode 100644
index 000000000000..0eac633fb354
--- /dev/null
+++ b/net/smc/smc_core.c
@@ -0,0 +1,682 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Basic Transport Functions exploiting Infiniband API
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/socket.h>
+#include <linux/if_vlan.h>
+#include <linux/random.h>
+#include <linux/workqueue.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_clc.h"
+#include "smc_core.h"
+#include "smc_ib.h"
+#include "smc_wr.h"
+#include "smc_llc.h"
+#include "smc_cdc.h"
+#include "smc_close.h"
+
+#define SMC_LGR_NUM_INCR 256
+#define SMC_LGR_FREE_DELAY (600 * HZ)
+
+static u32 smc_lgr_num; /* unique link group number */
+
+/* Register connection's alert token in our lookup structure.
+ * To use rbtrees we have to implement our own insert core.
+ * Requires @conns_lock
+ * @smc connection to register
+ * Returns 0 on success, != otherwise.
+ */
+static void smc_lgr_add_alert_token(struct smc_connection *conn)
+{
+ struct rb_node **link, *parent = NULL;
+ u32 token = conn->alert_token_local;
+
+ link = &conn->lgr->conns_all.rb_node;
+ while (*link) {
+ struct smc_connection *cur = rb_entry(*link,
+ struct smc_connection, alert_node);
+
+ parent = *link;
+ if (cur->alert_token_local > token)
+ link = &parent->rb_left;
+ else
+ link = &parent->rb_right;
+ }
+ /* Put the new node there */
+ rb_link_node(&conn->alert_node, parent, link);
+ rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
+}
+
+/* Register connection in link group by assigning an alert token
+ * registered in a search tree.
+ * Requires @conns_lock
+ * Note that '0' is a reserved value and not assigned.
+ */
+static void smc_lgr_register_conn(struct smc_connection *conn)
+{
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ static atomic_t nexttoken = ATOMIC_INIT(0);
+
+ /* find a new alert_token_local value not yet used by some connection
+ * in this link group
+ */
+ sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
+ while (!conn->alert_token_local) {
+ conn->alert_token_local = atomic_inc_return(&nexttoken);
+ if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
+ conn->alert_token_local = 0;
+ }
+ smc_lgr_add_alert_token(conn);
+ conn->lgr->conns_num++;
+}
+
+/* Unregister connection and reset the alert token of the given connection<
+ */
+static void __smc_lgr_unregister_conn(struct smc_connection *conn)
+{
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ struct smc_link_group *lgr = conn->lgr;
+
+ rb_erase(&conn->alert_node, &lgr->conns_all);
+ lgr->conns_num--;
+ conn->alert_token_local = 0;
+ conn->lgr = NULL;
+ sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
+}
+
+/* Unregister connection and trigger lgr freeing if applicable
+ */
+static void smc_lgr_unregister_conn(struct smc_connection *conn)
+{
+ struct smc_link_group *lgr = conn->lgr;
+ int reduced = 0;
+
+ write_lock_bh(&lgr->conns_lock);
+ if (conn->alert_token_local) {
+ reduced = 1;
+ __smc_lgr_unregister_conn(conn);
+ }
+ write_unlock_bh(&lgr->conns_lock);
+ if (reduced && !lgr->conns_num)
+ schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY);
+}
+
+static void smc_lgr_free_work(struct work_struct *work)
+{
+ struct smc_link_group *lgr = container_of(to_delayed_work(work),
+ struct smc_link_group,
+ free_work);
+ bool conns;
+
+ spin_lock_bh(&smc_lgr_list.lock);
+ read_lock_bh(&lgr->conns_lock);
+ conns = RB_EMPTY_ROOT(&lgr->conns_all);
+ read_unlock_bh(&lgr->conns_lock);
+ if (!conns) { /* number of lgr connections is no longer zero */
+ spin_unlock_bh(&smc_lgr_list.lock);
+ return;
+ }
+ list_del_init(&lgr->list); /* remove from smc_lgr_list */
+ spin_unlock_bh(&smc_lgr_list.lock);
+ smc_lgr_free(lgr);
+}
+
+/* create a new SMC link group */
+static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
+ struct smc_ib_device *smcibdev, u8 ibport,
+ char *peer_systemid, unsigned short vlan_id)
+{
+ struct smc_link_group *lgr;
+ struct smc_link *lnk;
+ u8 rndvec[3];
+ int rc = 0;
+ int i;
+
+ lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
+ if (!lgr) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+ lgr->sync_err = false;
+ lgr->daddr = peer_in_addr;
+ memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
+ lgr->vlan_id = vlan_id;
+ rwlock_init(&lgr->sndbufs_lock);
+ rwlock_init(&lgr->rmbs_lock);
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ INIT_LIST_HEAD(&lgr->sndbufs[i]);
+ INIT_LIST_HEAD(&lgr->rmbs[i]);
+ }
+ smc_lgr_num += SMC_LGR_NUM_INCR;
+ memcpy(&lgr->id, (u8 *)&smc_lgr_num, SMC_LGR_ID_SIZE);
+ INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
+ lgr->conns_all = RB_ROOT;
+
+ lnk = &lgr->lnk[SMC_SINGLE_LINK];
+ /* initialize link */
+ lnk->smcibdev = smcibdev;
+ lnk->ibport = ibport;
+ lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
+ if (!smcibdev->initialized)
+ smc_ib_setup_per_ibdev(smcibdev);
+ get_random_bytes(rndvec, sizeof(rndvec));
+ lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
+ rc = smc_wr_alloc_link_mem(lnk);
+ if (rc)
+ goto free_lgr;
+ init_waitqueue_head(&lnk->wr_tx_wait);
+ rc = smc_ib_create_protection_domain(lnk);
+ if (rc)
+ goto free_link_mem;
+ rc = smc_ib_create_queue_pair(lnk);
+ if (rc)
+ goto dealloc_pd;
+ rc = smc_wr_create_link(lnk);
+ if (rc)
+ goto destroy_qp;
+ init_completion(&lnk->llc_confirm);
+ init_completion(&lnk->llc_confirm_resp);
+
+ smc->conn.lgr = lgr;
+ rwlock_init(&lgr->conns_lock);
+ spin_lock_bh(&smc_lgr_list.lock);
+ list_add(&lgr->list, &smc_lgr_list.list);
+ spin_unlock_bh(&smc_lgr_list.lock);
+ return 0;
+
+destroy_qp:
+ smc_ib_destroy_queue_pair(lnk);
+dealloc_pd:
+ smc_ib_dealloc_protection_domain(lnk);
+free_link_mem:
+ smc_wr_free_link_mem(lnk);
+free_lgr:
+ kfree(lgr);
+out:
+ return rc;
+}
+
+static void smc_sndbuf_unuse(struct smc_connection *conn)
+{
+ if (conn->sndbuf_desc) {
+ conn->sndbuf_desc->used = 0;
+ conn->sndbuf_size = 0;
+ }
+}
+
+static void smc_rmb_unuse(struct smc_connection *conn)
+{
+ if (conn->rmb_desc) {
+ conn->rmb_desc->used = 0;
+ conn->rmbe_size = 0;
+ }
+}
+
+/* remove a finished connection from its link group */
+void smc_conn_free(struct smc_connection *conn)
+{
+ struct smc_link_group *lgr = conn->lgr;
+
+ if (!lgr)
+ return;
+ smc_cdc_tx_dismiss_slots(conn);
+ smc_lgr_unregister_conn(conn);
+ smc_rmb_unuse(conn);
+ smc_sndbuf_unuse(conn);
+}
+
+static void smc_link_clear(struct smc_link *lnk)
+{
+ lnk->peer_qpn = 0;
+ smc_ib_modify_qp_reset(lnk);
+ smc_wr_free_link(lnk);
+ smc_ib_destroy_queue_pair(lnk);
+ smc_ib_dealloc_protection_domain(lnk);
+ smc_wr_free_link_mem(lnk);
+}
+
+static void smc_lgr_free_sndbufs(struct smc_link_group *lgr)
+{
+ struct smc_buf_desc *sndbuf_desc, *bf_desc;
+ int i;
+
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ list_for_each_entry_safe(sndbuf_desc, bf_desc, &lgr->sndbufs[i],
+ list) {
+ list_del(&sndbuf_desc->list);
+ smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+ smc_uncompress_bufsize(i),
+ sndbuf_desc, DMA_TO_DEVICE);
+ kfree(sndbuf_desc->cpu_addr);
+ kfree(sndbuf_desc);
+ }
+ }
+}
+
+static void smc_lgr_free_rmbs(struct smc_link_group *lgr)
+{
+ struct smc_buf_desc *rmb_desc, *bf_desc;
+ struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+ int i;
+
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ list_for_each_entry_safe(rmb_desc, bf_desc, &lgr->rmbs[i],
+ list) {
+ list_del(&rmb_desc->list);
+ smc_ib_buf_unmap(lnk->smcibdev,
+ smc_uncompress_bufsize(i),
+ rmb_desc, DMA_FROM_DEVICE);
+ kfree(rmb_desc->cpu_addr);
+ kfree(rmb_desc);
+ }
+ }
+}
+
+/* remove a link group */
+void smc_lgr_free(struct smc_link_group *lgr)
+{
+ smc_lgr_free_rmbs(lgr);
+ smc_lgr_free_sndbufs(lgr);
+ smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
+ kfree(lgr);
+}
+
+/* terminate linkgroup abnormally */
+void smc_lgr_terminate(struct smc_link_group *lgr)
+{
+ struct smc_connection *conn;
+ struct smc_sock *smc;
+ struct rb_node *node;
+
+ spin_lock_bh(&smc_lgr_list.lock);
+ if (list_empty(&lgr->list)) {
+ /* termination already triggered */
+ spin_unlock_bh(&smc_lgr_list.lock);
+ return;
+ }
+ /* do not use this link group for new connections */
+ list_del_init(&lgr->list);
+ spin_unlock_bh(&smc_lgr_list.lock);
+
+ write_lock_bh(&lgr->conns_lock);
+ node = rb_first(&lgr->conns_all);
+ while (node) {
+ conn = rb_entry(node, struct smc_connection, alert_node);
+ smc = container_of(conn, struct smc_sock, conn);
+ sock_hold(&smc->sk);
+ __smc_lgr_unregister_conn(conn);
+ smc_close_active_abort(smc);
+ sock_put(&smc->sk);
+ node = rb_first(&lgr->conns_all);
+ }
+ write_unlock_bh(&lgr->conns_lock);
+}
+
+/* Determine vlan of internal TCP socket.
+ * @vlan_id: address to store the determined vlan id into
+ */
+static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
+{
+ struct dst_entry *dst = sk_dst_get(clcsock->sk);
+ int rc = 0;
+
+ *vlan_id = 0;
+ if (!dst) {
+ rc = -ENOTCONN;
+ goto out;
+ }
+ if (!dst->dev) {
+ rc = -ENODEV;
+ goto out_rel;
+ }
+
+ if (is_vlan_dev(dst->dev))
+ *vlan_id = vlan_dev_vlan_id(dst->dev);
+
+out_rel:
+ dst_release(dst);
+out:
+ return rc;
+}
+
+/* determine the link gid matching the vlan id of the link group */
+static int smc_link_determine_gid(struct smc_link_group *lgr)
+{
+ struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+ struct ib_gid_attr gattr;
+ union ib_gid gid;
+ int i;
+
+ if (!lgr->vlan_id) {
+ lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
+ return 0;
+ }
+
+ for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
+ i++) {
+ if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
+ &gattr))
+ continue;
+ if (gattr.ndev &&
+ (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id)) {
+ lnk->gid = gid;
+ return 0;
+ }
+ }
+ return -ENODEV;
+}
+
+/* create a new SMC connection (and a new link group if necessary) */
+int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
+ struct smc_ib_device *smcibdev, u8 ibport,
+ struct smc_clc_msg_local *lcl, int srv_first_contact)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct smc_link_group *lgr;
+ unsigned short vlan_id;
+ enum smc_lgr_role role;
+ int local_contact = SMC_FIRST_CONTACT;
+ int rc = 0;
+
+ role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+ rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
+ if (rc)
+ return rc;
+
+ if ((role == SMC_CLNT) && srv_first_contact)
+ /* create new link group as well */
+ goto create;
+
+ /* determine if an existing link group can be reused */
+ spin_lock_bh(&smc_lgr_list.lock);
+ list_for_each_entry(lgr, &smc_lgr_list.list, list) {
+ write_lock_bh(&lgr->conns_lock);
+ if (!memcmp(lgr->peer_systemid, lcl->id_for_peer,
+ SMC_SYSTEMID_LEN) &&
+ !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
+ SMC_GID_SIZE) &&
+ !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
+ sizeof(lcl->mac)) &&
+ !lgr->sync_err &&
+ (lgr->role == role) &&
+ (lgr->vlan_id == vlan_id) &&
+ ((role == SMC_CLNT) ||
+ (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) {
+ /* link group found */
+ local_contact = SMC_REUSE_CONTACT;
+ conn->lgr = lgr;
+ smc_lgr_register_conn(conn); /* add smc conn to lgr */
+ write_unlock_bh(&lgr->conns_lock);
+ break;
+ }
+ write_unlock_bh(&lgr->conns_lock);
+ }
+ spin_unlock_bh(&smc_lgr_list.lock);
+
+ if (role == SMC_CLNT && !srv_first_contact &&
+ (local_contact == SMC_FIRST_CONTACT)) {
+ /* Server reuses a link group, but Client wants to start
+ * a new one
+ * send out_of_sync decline, reason synchr. error
+ */
+ return -ENOLINK;
+ }
+
+create:
+ if (local_contact == SMC_FIRST_CONTACT) {
+ rc = smc_lgr_create(smc, peer_in_addr, smcibdev, ibport,
+ lcl->id_for_peer, vlan_id);
+ if (rc)
+ goto out;
+ smc_lgr_register_conn(conn); /* add smc conn to lgr */
+ rc = smc_link_determine_gid(conn->lgr);
+ }
+ conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
+ conn->local_tx_ctrl.len = sizeof(struct smc_cdc_msg);
+#ifndef KERNEL_HAS_ATOMIC64
+ spin_lock_init(&conn->acurs_lock);
+#endif
+
+out:
+ return rc ? rc : local_contact;
+}
+
+/* try to reuse a sndbuf description slot of the sndbufs list for a certain
+ * buf_size; if not available, return NULL
+ */
+static inline
+struct smc_buf_desc *smc_sndbuf_get_slot(struct smc_link_group *lgr,
+ int compressed_bufsize)
+{
+ struct smc_buf_desc *sndbuf_slot;
+
+ read_lock_bh(&lgr->sndbufs_lock);
+ list_for_each_entry(sndbuf_slot, &lgr->sndbufs[compressed_bufsize],
+ list) {
+ if (cmpxchg(&sndbuf_slot->used, 0, 1) == 0) {
+ read_unlock_bh(&lgr->sndbufs_lock);
+ return sndbuf_slot;
+ }
+ }
+ read_unlock_bh(&lgr->sndbufs_lock);
+ return NULL;
+}
+
+/* try to reuse an rmb description slot of the rmbs list for a certain
+ * rmbe_size; if not available, return NULL
+ */
+static inline
+struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr,
+ int compressed_bufsize)
+{
+ struct smc_buf_desc *rmb_slot;
+
+ read_lock_bh(&lgr->rmbs_lock);
+ list_for_each_entry(rmb_slot, &lgr->rmbs[compressed_bufsize],
+ list) {
+ if (cmpxchg(&rmb_slot->used, 0, 1) == 0) {
+ read_unlock_bh(&lgr->rmbs_lock);
+ return rmb_slot;
+ }
+ }
+ read_unlock_bh(&lgr->rmbs_lock);
+ return NULL;
+}
+
+/* one of the conditions for announcing a receiver's current window size is
+ * that it "results in a minimum increase in the window size of 10% of the
+ * receive buffer space" [RFC7609]
+ */
+static inline int smc_rmb_wnd_update_limit(int rmbe_size)
+{
+ return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
+}
+
+/* create the tx buffer for an SMC socket */
+int smc_sndbuf_create(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct smc_link_group *lgr = conn->lgr;
+ int tmp_bufsize, tmp_bufsize_short;
+ struct smc_buf_desc *sndbuf_desc;
+ int rc;
+
+ /* use socket send buffer size (w/o overhead) as start value */
+ for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2);
+ tmp_bufsize_short >= 0; tmp_bufsize_short--) {
+ tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
+ /* check for reusable sndbuf_slot in the link group */
+ sndbuf_desc = smc_sndbuf_get_slot(lgr, tmp_bufsize_short);
+ if (sndbuf_desc) {
+ memset(sndbuf_desc->cpu_addr, 0, tmp_bufsize);
+ break; /* found reusable slot */
+ }
+ /* try to alloc a new send buffer */
+ sndbuf_desc = kzalloc(sizeof(*sndbuf_desc), GFP_KERNEL);
+ if (!sndbuf_desc)
+ break; /* give up with -ENOMEM */
+ sndbuf_desc->cpu_addr = kzalloc(tmp_bufsize,
+ GFP_KERNEL | __GFP_NOWARN |
+ __GFP_NOMEMALLOC |
+ __GFP_NORETRY);
+ if (!sndbuf_desc->cpu_addr) {
+ kfree(sndbuf_desc);
+ sndbuf_desc = NULL;
+ /* if send buffer allocation has failed,
+ * try a smaller one
+ */
+ continue;
+ }
+ rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+ tmp_bufsize, sndbuf_desc,
+ DMA_TO_DEVICE);
+ if (rc) {
+ kfree(sndbuf_desc->cpu_addr);
+ kfree(sndbuf_desc);
+ sndbuf_desc = NULL;
+ continue; /* if mapping failed, try smaller one */
+ }
+ sndbuf_desc->used = 1;
+ write_lock_bh(&lgr->sndbufs_lock);
+ list_add(&sndbuf_desc->list,
+ &lgr->sndbufs[tmp_bufsize_short]);
+ write_unlock_bh(&lgr->sndbufs_lock);
+ break;
+ }
+ if (sndbuf_desc && sndbuf_desc->cpu_addr) {
+ conn->sndbuf_desc = sndbuf_desc;
+ conn->sndbuf_size = tmp_bufsize;
+ smc->sk.sk_sndbuf = tmp_bufsize * 2;
+ atomic_set(&conn->sndbuf_space, tmp_bufsize);
+ return 0;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+/* create the RMB for an SMC socket (even though the SMC protocol
+ * allows more than one RMB-element per RMB, the Linux implementation
+ * uses just one RMB-element per RMB, i.e. uses an extra RMB for every
+ * connection in a link group
+ */
+int smc_rmb_create(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct smc_link_group *lgr = conn->lgr;
+ int tmp_bufsize, tmp_bufsize_short;
+ struct smc_buf_desc *rmb_desc;
+ int rc;
+
+ /* use socket recv buffer size (w/o overhead) as start value */
+ for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_rcvbuf / 2);
+ tmp_bufsize_short >= 0; tmp_bufsize_short--) {
+ tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
+ /* check for reusable rmb_slot in the link group */
+ rmb_desc = smc_rmb_get_slot(lgr, tmp_bufsize_short);
+ if (rmb_desc) {
+ memset(rmb_desc->cpu_addr, 0, tmp_bufsize);
+ break; /* found reusable slot */
+ }
+ /* try to alloc a new RMB */
+ rmb_desc = kzalloc(sizeof(*rmb_desc), GFP_KERNEL);
+ if (!rmb_desc)
+ break; /* give up with -ENOMEM */
+ rmb_desc->cpu_addr = kzalloc(tmp_bufsize,
+ GFP_KERNEL | __GFP_NOWARN |
+ __GFP_NOMEMALLOC |
+ __GFP_NORETRY);
+ if (!rmb_desc->cpu_addr) {
+ kfree(rmb_desc);
+ rmb_desc = NULL;
+ /* if RMB allocation has failed,
+ * try a smaller one
+ */
+ continue;
+ }
+ rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+ tmp_bufsize, rmb_desc,
+ DMA_FROM_DEVICE);
+ if (rc) {
+ kfree(rmb_desc->cpu_addr);
+ kfree(rmb_desc);
+ rmb_desc = NULL;
+ continue; /* if mapping failed, try smaller one */
+ }
+ rc = smc_ib_get_memory_region(lgr->lnk[SMC_SINGLE_LINK].roce_pd,
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_LOCAL_WRITE,
+ &rmb_desc->mr_rx[SMC_SINGLE_LINK]);
+ if (rc) {
+ smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+ tmp_bufsize, rmb_desc,
+ DMA_FROM_DEVICE);
+ kfree(rmb_desc->cpu_addr);
+ kfree(rmb_desc);
+ rmb_desc = NULL;
+ continue;
+ }
+ rmb_desc->used = 1;
+ write_lock_bh(&lgr->rmbs_lock);
+ list_add(&rmb_desc->list,
+ &lgr->rmbs[tmp_bufsize_short]);
+ write_unlock_bh(&lgr->rmbs_lock);
+ break;
+ }
+ if (rmb_desc && rmb_desc->cpu_addr) {
+ conn->rmb_desc = rmb_desc;
+ conn->rmbe_size = tmp_bufsize;
+ conn->rmbe_size_short = tmp_bufsize_short;
+ smc->sk.sk_rcvbuf = tmp_bufsize * 2;
+ atomic_set(&conn->bytes_to_rcv, 0);
+ conn->rmbe_update_limit = smc_rmb_wnd_update_limit(tmp_bufsize);
+ return 0;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
+{
+ int i;
+
+ for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
+ if (!test_and_set_bit(i, lgr->rtokens_used_mask))
+ return i;
+ }
+ return -ENOSPC;
+}
+
+/* save rkey and dma_addr received from peer during clc handshake */
+int smc_rmb_rtoken_handling(struct smc_connection *conn,
+ struct smc_clc_msg_accept_confirm *clc)
+{
+ u64 dma_addr = be64_to_cpu(clc->rmb_dma_addr);
+ struct smc_link_group *lgr = conn->lgr;
+ u32 rkey = ntohl(clc->rmb_rkey);
+ int i;
+
+ for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
+ if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
+ test_bit(i, lgr->rtokens_used_mask)) {
+ conn->rtoken_idx = i;
+ return 0;
+ }
+ }
+ conn->rtoken_idx = smc_rmb_reserve_rtoken_idx(lgr);
+ if (conn->rtoken_idx < 0)
+ return conn->rtoken_idx;
+ lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey = rkey;
+ lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr = dma_addr;
+ return 0;
+}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
new file mode 100644
index 000000000000..27eb38056a27
--- /dev/null
+++ b/net/smc/smc_core.h
@@ -0,0 +1,181 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for SMC Connections, Link Groups and Links
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_CORE_H
+#define _SMC_CORE_H
+
+#include <linux/atomic.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_ib.h"
+
+#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */
+
+struct smc_lgr_list { /* list of link group definition */
+ struct list_head list;
+ spinlock_t lock; /* protects list of link groups */
+};
+
+extern struct smc_lgr_list smc_lgr_list; /* list of link groups */
+
+enum smc_lgr_role { /* possible roles of a link group */
+ SMC_CLNT, /* client */
+ SMC_SERV /* server */
+};
+
+#define SMC_WR_BUF_SIZE 48 /* size of work request buffer */
+
+struct smc_wr_buf {
+ u8 raw[SMC_WR_BUF_SIZE];
+};
+
+struct smc_link {
+ struct smc_ib_device *smcibdev; /* ib-device */
+ u8 ibport; /* port - values 1 | 2 */
+ struct ib_pd *roce_pd; /* IB protection domain,
+ * unique for every RoCE QP
+ */
+ struct ib_qp *roce_qp; /* IB queue pair */
+ struct ib_qp_attr qp_attr; /* IB queue pair attributes */
+
+ struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */
+ struct ib_send_wr *wr_tx_ibs; /* WR send meta data */
+ struct ib_sge *wr_tx_sges; /* WR send gather meta data */
+ struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */
+ /* above four vectors have wr_tx_cnt elements and use the same index */
+ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */
+ atomic_long_t wr_tx_id; /* seq # of last sent WR */
+ unsigned long *wr_tx_mask; /* bit mask of used indexes */
+ u32 wr_tx_cnt; /* number of WR send buffers */
+ wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */
+
+ struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */
+ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
+ struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */
+ /* above three vectors have wr_rx_cnt elements and use the same index */
+ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
+ u64 wr_rx_id; /* seq # of last recv WR */
+ u32 wr_rx_cnt; /* number of WR recv buffers */
+
+ union ib_gid gid; /* gid matching used vlan id */
+ u32 peer_qpn; /* QP number of peer */
+ enum ib_mtu path_mtu; /* used mtu */
+ enum ib_mtu peer_mtu; /* mtu size of peer */
+ u32 psn_initial; /* QP tx initial packet seqno */
+ u32 peer_psn; /* QP rx initial packet seqno */
+ u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */
+ u8 peer_gid[sizeof(union ib_gid)]; /* gid of peer*/
+ u8 link_id; /* unique # within link group */
+ struct completion llc_confirm; /* wait for rx of conf link */
+ struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */
+};
+
+/* For now we just allow one parallel link per link group. The SMC protocol
+ * allows more (up to 8).
+ */
+#define SMC_LINKS_PER_LGR_MAX 1
+#define SMC_SINGLE_LINK 0
+
+#define SMC_FIRST_CONTACT 1 /* first contact to a peer */
+#define SMC_REUSE_CONTACT 0 /* follow-on contact to a peer*/
+
+/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
+struct smc_buf_desc {
+ struct list_head list;
+ u64 dma_addr[SMC_LINKS_PER_LGR_MAX];
+ /* mapped address of buffer */
+ void *cpu_addr; /* virtual address of buffer */
+ struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
+ /* for rmb only:
+ * rkey provided to peer
+ */
+ u32 used; /* currently used / unused */
+};
+
+struct smc_rtoken { /* address/key of remote RMB */
+ u64 dma_addr;
+ u32 rkey;
+};
+
+#define SMC_LGR_ID_SIZE 4
+
+struct smc_link_group {
+ struct list_head list;
+ enum smc_lgr_role role; /* client or server */
+ __be32 daddr; /* destination ip address */
+ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */
+ char peer_systemid[SMC_SYSTEMID_LEN];
+ /* unique system_id of peer */
+ struct rb_root conns_all; /* connection tree */
+ rwlock_t conns_lock; /* protects conns_all */
+ unsigned int conns_num; /* current # of connections */
+ unsigned short vlan_id; /* vlan id of link group */
+
+ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */
+ rwlock_t sndbufs_lock; /* protects tx buffers */
+ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */
+ rwlock_t rmbs_lock; /* protects rx buffers */
+ struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX]
+ [SMC_LINKS_PER_LGR_MAX];
+ /* remote addr/key pairs */
+ unsigned long rtokens_used_mask[BITS_TO_LONGS(
+ SMC_RMBS_PER_LGR_MAX)];
+ /* used rtoken elements */
+
+ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */
+ struct delayed_work free_work; /* delayed freeing of an lgr */
+ bool sync_err; /* lgr no longer fits to peer */
+};
+
+/* Find the connection associated with the given alert token in the link group.
+ * To use rbtrees we have to implement our own search core.
+ * Requires @conns_lock
+ * @token alert token to search for
+ * @lgr link group to search in
+ * Returns connection associated with token if found, NULL otherwise.
+ */
+static inline struct smc_connection *smc_lgr_find_conn(
+ u32 token, struct smc_link_group *lgr)
+{
+ struct smc_connection *res = NULL;
+ struct rb_node *node;
+
+ node = lgr->conns_all.rb_node;
+ while (node) {
+ struct smc_connection *cur = rb_entry(node,
+ struct smc_connection, alert_node);
+
+ if (cur->alert_token_local > token) {
+ node = node->rb_left;
+ } else {
+ if (cur->alert_token_local < token) {
+ node = node->rb_right;
+ } else {
+ res = cur;
+ break;
+ }
+ }
+ }
+
+ return res;
+}
+
+struct smc_sock;
+struct smc_clc_msg_accept_confirm;
+
+void smc_lgr_free(struct smc_link_group *lgr);
+void smc_lgr_terminate(struct smc_link_group *lgr);
+int smc_sndbuf_create(struct smc_sock *smc);
+int smc_rmb_create(struct smc_sock *smc);
+int smc_rmb_rtoken_handling(struct smc_connection *conn,
+ struct smc_clc_msg_accept_confirm *clc);
+
+#endif
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
new file mode 100644
index 000000000000..d2d01cf70224
--- /dev/null
+++ b/net/smc/smc_diag.c
@@ -0,0 +1,215 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Monitoring SMC transport protocol sockets
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/sock_diag.h>
+#include <linux/inet_diag.h>
+#include <linux/smc_diag.h>
+#include <net/netlink.h>
+#include <net/smc.h>
+
+#include "smc.h"
+#include "smc_core.h"
+
+static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
+{
+ sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
+ be16_to_cpu(((__be16 *)gid_raw)[0]),
+ be16_to_cpu(((__be16 *)gid_raw)[1]),
+ be16_to_cpu(((__be16 *)gid_raw)[2]),
+ be16_to_cpu(((__be16 *)gid_raw)[3]),
+ be16_to_cpu(((__be16 *)gid_raw)[4]),
+ be16_to_cpu(((__be16 *)gid_raw)[5]),
+ be16_to_cpu(((__be16 *)gid_raw)[6]),
+ be16_to_cpu(((__be16 *)gid_raw)[7]));
+}
+
+static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
+{
+ struct smc_sock *smc = smc_sk(sk);
+
+ r->diag_family = sk->sk_family;
+ if (!smc->clcsock)
+ return;
+ r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
+ r->id.idiag_dport = smc->clcsock->sk->sk_dport;
+ r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
+ sock_diag_save_cookie(sk, r->id.idiag_cookie);
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+ r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
+ r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
+}
+
+static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
+ struct smc_diag_msg *r,
+ struct user_namespace *user_ns)
+{
+ if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown))
+ return 1;
+
+ r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+ r->diag_inode = sock_i_ino(sk);
+ return 0;
+}
+
+static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct smc_diag_req *req,
+ struct nlattr *bc)
+{
+ struct smc_sock *smc = smc_sk(sk);
+ struct user_namespace *user_ns;
+ struct smc_diag_msg *r;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ smc_diag_msg_common_fill(r, sk);
+ r->diag_state = sk->sk_state;
+ r->diag_fallback = smc->use_fallback;
+ user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk);
+ if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns))
+ goto errout;
+
+ if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.lgr) {
+ struct smc_connection *conn = &smc->conn;
+ struct smc_diag_conninfo cinfo = {
+ .token = conn->alert_token_local,
+ .sndbuf_size = conn->sndbuf_size,
+ .rmbe_size = conn->rmbe_size,
+ .peer_rmbe_size = conn->peer_rmbe_size,
+
+ .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap,
+ .rx_prod.count = conn->local_rx_ctrl.prod.count,
+ .rx_cons.wrap = conn->local_rx_ctrl.cons.wrap,
+ .rx_cons.count = conn->local_rx_ctrl.cons.count,
+
+ .tx_prod.wrap = conn->local_tx_ctrl.prod.wrap,
+ .tx_prod.count = conn->local_tx_ctrl.prod.count,
+ .tx_cons.wrap = conn->local_tx_ctrl.cons.wrap,
+ .tx_cons.count = conn->local_tx_ctrl.cons.count,
+
+ .tx_prod_flags =
+ *(u8 *)&conn->local_tx_ctrl.prod_flags,
+ .tx_conn_state_flags =
+ *(u8 *)&conn->local_tx_ctrl.conn_state_flags,
+ .rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags,
+ .rx_conn_state_flags =
+ *(u8 *)&conn->local_rx_ctrl.conn_state_flags,
+
+ .tx_prep.wrap = conn->tx_curs_prep.wrap,
+ .tx_prep.count = conn->tx_curs_prep.count,
+ .tx_sent.wrap = conn->tx_curs_sent.wrap,
+ .tx_sent.count = conn->tx_curs_sent.count,
+ .tx_fin.wrap = conn->tx_curs_fin.wrap,
+ .tx_fin.count = conn->tx_curs_fin.count,
+ };
+
+ if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0)
+ goto errout;
+ }
+
+ if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr) {
+ struct smc_diag_lgrinfo linfo = {
+ .role = smc->conn.lgr->role,
+ .lnk[0].ibport = smc->conn.lgr->lnk[0].ibport,
+ .lnk[0].link_id = smc->conn.lgr->lnk[0].link_id,
+ };
+
+ memcpy(linfo.lnk[0].ibname,
+ smc->conn.lgr->lnk[0].smcibdev->ibdev->name,
+ sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name));
+ smc_gid_be16_convert(linfo.lnk[0].gid,
+ smc->conn.lgr->lnk[0].gid.raw);
+ smc_gid_be16_convert(linfo.lnk[0].peer_gid,
+ smc->conn.lgr->lnk[0].peer_gid);
+
+ if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0)
+ goto errout;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+errout:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *bc = NULL;
+ struct hlist_head *head;
+ struct sock *sk;
+ int rc = 0;
+
+ read_lock(&smc_proto.h.smc_hash->lock);
+ head = &smc_proto.h.smc_hash->ht;
+ if (hlist_empty(head))
+ goto out;
+
+ sk_for_each(sk, head) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc);
+ if (rc)
+ break;
+ }
+
+out:
+ read_unlock(&smc_proto.h.smc_hash->lock);
+ return rc;
+}
+
+static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+{
+ struct net *net = sock_net(skb->sk);
+
+ if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
+ h->nlmsg_flags & NLM_F_DUMP) {
+ {
+ struct netlink_dump_control c = {
+ .dump = smc_diag_dump,
+ .min_dump_alloc = SKB_WITH_OVERHEAD(32768),
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+ }
+ }
+ return 0;
+}
+
+static const struct sock_diag_handler smc_diag_handler = {
+ .family = AF_SMC,
+ .dump = smc_diag_handler_dump,
+};
+
+static int __init smc_diag_init(void)
+{
+ return sock_diag_register(&smc_diag_handler);
+}
+
+static void __exit smc_diag_exit(void)
+{
+ sock_diag_unregister(&smc_diag_handler);
+}
+
+module_init(smc_diag_init);
+module_exit(smc_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */);
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
new file mode 100644
index 000000000000..e6743c008ac5
--- /dev/null
+++ b/net/smc/smc_ib.c
@@ -0,0 +1,466 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * IB infrastructure:
+ * Establish SMC-R as an Infiniband Client to be notified about added and
+ * removed IB devices of type RDMA.
+ * Determine device and port characteristics for these IB devices.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/random.h>
+#include <linux/workqueue.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc_pnet.h"
+#include "smc_ib.h"
+#include "smc_core.h"
+#include "smc_wr.h"
+#include "smc.h"
+
+#define SMC_QP_MIN_RNR_TIMER 5
+#define SMC_QP_TIMEOUT 15 /* 4096 * 2 ** timeout usec */
+#define SMC_QP_RETRY_CNT 7 /* 7: infinite */
+#define SMC_QP_RNR_RETRY 7 /* 7: infinite */
+
+struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
+ .lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock),
+ .list = LIST_HEAD_INIT(smc_ib_devices.list),
+};
+
+#define SMC_LOCAL_SYSTEMID_RESET "%%%%%%%"
+
+u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system
+ * identifier
+ */
+
+int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
+ struct ib_mr **mr)
+{
+ int rc;
+
+ if (*mr)
+ return 0; /* already done */
+
+ /* obtain unique key -
+ * next invocation of get_dma_mr returns a different key!
+ */
+ *mr = pd->device->get_dma_mr(pd, access_flags);
+ rc = PTR_ERR_OR_ZERO(*mr);
+ if (IS_ERR(*mr))
+ *mr = NULL;
+ return rc;
+}
+
+static int smc_ib_modify_qp_init(struct smc_link *lnk)
+{
+ struct ib_qp_attr qp_attr;
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_INIT;
+ qp_attr.pkey_index = 0;
+ qp_attr.port_num = lnk->ibport;
+ qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
+ | IB_ACCESS_REMOTE_WRITE;
+ return ib_modify_qp(lnk->roce_qp, &qp_attr,
+ IB_QP_STATE | IB_QP_PKEY_INDEX |
+ IB_QP_ACCESS_FLAGS | IB_QP_PORT);
+}
+
+static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
+{
+ enum ib_qp_attr_mask qp_attr_mask =
+ IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
+ struct ib_qp_attr qp_attr;
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_RTR;
+ qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
+ qp_attr.ah_attr.port_num = lnk->ibport;
+ qp_attr.ah_attr.ah_flags = IB_AH_GRH;
+ qp_attr.ah_attr.grh.hop_limit = 1;
+ memcpy(&qp_attr.ah_attr.grh.dgid, lnk->peer_gid,
+ sizeof(lnk->peer_gid));
+ memcpy(&qp_attr.ah_attr.dmac, lnk->peer_mac,
+ sizeof(lnk->peer_mac));
+ qp_attr.dest_qp_num = lnk->peer_qpn;
+ qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
+ qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
+ * requests
+ */
+ qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;
+
+ return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
+}
+
+int smc_ib_modify_qp_rts(struct smc_link *lnk)
+{
+ struct ib_qp_attr qp_attr;
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_RTS;
+ qp_attr.timeout = SMC_QP_TIMEOUT; /* local ack timeout */
+ qp_attr.retry_cnt = SMC_QP_RETRY_CNT; /* retry count */
+ qp_attr.rnr_retry = SMC_QP_RNR_RETRY; /* RNR retries, 7=infinite */
+ qp_attr.sq_psn = lnk->psn_initial; /* starting send packet seq # */
+ qp_attr.max_rd_atomic = 1; /* # of outstanding RDMA reads and
+ * atomic ops allowed
+ */
+ return ib_modify_qp(lnk->roce_qp, &qp_attr,
+ IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
+ IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC);
+}
+
+int smc_ib_modify_qp_reset(struct smc_link *lnk)
+{
+ struct ib_qp_attr qp_attr;
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_RESET;
+ return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
+}
+
+int smc_ib_ready_link(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr =
+ container_of(lnk, struct smc_link_group, lnk[0]);
+ int rc = 0;
+
+ rc = smc_ib_modify_qp_init(lnk);
+ if (rc)
+ goto out;
+
+ rc = smc_ib_modify_qp_rtr(lnk);
+ if (rc)
+ goto out;
+ smc_wr_remember_qp_attr(lnk);
+ rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
+ IB_CQ_SOLICITED_MASK);
+ if (rc)
+ goto out;
+ rc = smc_wr_rx_post_init(lnk);
+ if (rc)
+ goto out;
+ smc_wr_remember_qp_attr(lnk);
+
+ if (lgr->role == SMC_SERV) {
+ rc = smc_ib_modify_qp_rts(lnk);
+ if (rc)
+ goto out;
+ smc_wr_remember_qp_attr(lnk);
+ }
+out:
+ return rc;
+}
+
+/* process context wrapper for might_sleep smc_ib_remember_port_attr */
+static void smc_ib_port_event_work(struct work_struct *work)
+{
+ struct smc_ib_device *smcibdev = container_of(
+ work, struct smc_ib_device, port_event_work);
+ u8 port_idx;
+
+ for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
+ smc_ib_remember_port_attr(smcibdev, port_idx + 1);
+ clear_bit(port_idx, &smcibdev->port_event_mask);
+ }
+}
+
+/* can be called in IRQ context */
+static void smc_ib_global_event_handler(struct ib_event_handler *handler,
+ struct ib_event *ibevent)
+{
+ struct smc_ib_device *smcibdev;
+ u8 port_idx;
+
+ smcibdev = container_of(handler, struct smc_ib_device, event_handler);
+ if (!smc_pnet_find_ib(smcibdev->ibdev->name))
+ return;
+
+ switch (ibevent->event) {
+ case IB_EVENT_PORT_ERR:
+ port_idx = ibevent->element.port_num - 1;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ schedule_work(&smcibdev->port_event_work);
+ /* fall through */
+ case IB_EVENT_DEVICE_FATAL:
+ /* tbd in follow-on patch:
+ * abnormal close of corresponding connections
+ */
+ break;
+ case IB_EVENT_PORT_ACTIVE:
+ port_idx = ibevent->element.port_num - 1;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ schedule_work(&smcibdev->port_event_work);
+ break;
+ default:
+ break;
+ }
+}
+
+void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
+{
+ ib_dealloc_pd(lnk->roce_pd);
+ lnk->roce_pd = NULL;
+}
+
+int smc_ib_create_protection_domain(struct smc_link *lnk)
+{
+ int rc;
+
+ lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
+ rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
+ if (IS_ERR(lnk->roce_pd))
+ lnk->roce_pd = NULL;
+ return rc;
+}
+
+static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
+{
+ switch (ibevent->event) {
+ case IB_EVENT_DEVICE_FATAL:
+ case IB_EVENT_GID_CHANGE:
+ case IB_EVENT_PORT_ERR:
+ case IB_EVENT_QP_ACCESS_ERR:
+ /* tbd in follow-on patch:
+ * abnormal close of corresponding connections
+ */
+ break;
+ default:
+ break;
+ }
+}
+
+void smc_ib_destroy_queue_pair(struct smc_link *lnk)
+{
+ ib_destroy_qp(lnk->roce_qp);
+ lnk->roce_qp = NULL;
+}
+
+/* create a queue pair within the protection domain for a link */
+int smc_ib_create_queue_pair(struct smc_link *lnk)
+{
+ struct ib_qp_init_attr qp_attr = {
+ .event_handler = smc_ib_qp_event_handler,
+ .qp_context = lnk,
+ .send_cq = lnk->smcibdev->roce_cq_send,
+ .recv_cq = lnk->smcibdev->roce_cq_recv,
+ .srq = NULL,
+ .cap = {
+ .max_send_wr = SMC_WR_BUF_CNT,
+ /* include unsolicited rdma_writes as well,
+ * there are max. 2 RDMA_WRITE per 1 WR_SEND
+ */
+ .max_recv_wr = SMC_WR_BUF_CNT * 3,
+ .max_send_sge = SMC_IB_MAX_SEND_SGE,
+ .max_recv_sge = 1,
+ .max_inline_data = SMC_WR_TX_SIZE,
+ },
+ .sq_sig_type = IB_SIGNAL_REQ_WR,
+ .qp_type = IB_QPT_RC,
+ };
+ int rc;
+
+ lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
+ rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
+ if (IS_ERR(lnk->roce_qp))
+ lnk->roce_qp = NULL;
+ else
+ smc_wr_remember_qp_attr(lnk);
+ return rc;
+}
+
+/* map a new TX or RX buffer to DMA */
+int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction)
+{
+ int rc = 0;
+
+ if (buf_slot->dma_addr[SMC_SINGLE_LINK])
+ return rc; /* already mapped */
+ buf_slot->dma_addr[SMC_SINGLE_LINK] =
+ ib_dma_map_single(smcibdev->ibdev, buf_slot->cpu_addr,
+ buf_size, data_direction);
+ if (ib_dma_mapping_error(smcibdev->ibdev,
+ buf_slot->dma_addr[SMC_SINGLE_LINK]))
+ rc = -EIO;
+ return rc;
+}
+
+void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int buf_size,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction)
+{
+ if (!buf_slot->dma_addr[SMC_SINGLE_LINK])
+ return; /* already unmapped */
+ ib_dma_unmap_single(smcibdev->ibdev, *buf_slot->dma_addr, buf_size,
+ data_direction);
+ buf_slot->dma_addr[SMC_SINGLE_LINK] = 0;
+}
+
+static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ struct net_device *ndev;
+ int rc;
+
+ rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
+ &smcibdev->gid[ibport - 1], NULL);
+ /* the SMC protocol requires specification of the roce MAC address;
+ * if net_device cannot be determined, it can be derived from gid 0
+ */
+ ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport);
+ if (ndev) {
+ memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN);
+ } else if (!rc) {
+ memcpy(&smcibdev->mac[ibport - 1][0],
+ &smcibdev->gid[ibport - 1].raw[8], 3);
+ memcpy(&smcibdev->mac[ibport - 1][3],
+ &smcibdev->gid[ibport - 1].raw[13], 3);
+ smcibdev->mac[ibport - 1][0] &= ~0x02;
+ }
+ return rc;
+}
+
+/* Create an identifier unique for this instance of SMC-R.
+ * The MAC-address of the first active registered IB device
+ * plus a random 2-byte number is used to create this identifier.
+ * This name is delivered to the peer during connection initialization.
+ */
+static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
+ u8 ibport)
+{
+ memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
+ sizeof(smcibdev->mac[ibport - 1]));
+ get_random_bytes(&local_systemid[0], 2);
+}
+
+bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
+}
+
+int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ int rc;
+
+ memset(&smcibdev->pattr[ibport - 1], 0,
+ sizeof(smcibdev->pattr[ibport - 1]));
+ rc = ib_query_port(smcibdev->ibdev, ibport,
+ &smcibdev->pattr[ibport - 1]);
+ if (rc)
+ goto out;
+ rc = smc_ib_fill_gid_and_mac(smcibdev, ibport);
+ if (rc)
+ goto out;
+ if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
+ sizeof(local_systemid)) &&
+ smc_ib_port_active(smcibdev, ibport))
+ /* create unique system identifier */
+ smc_ib_define_local_systemid(smcibdev, ibport);
+out:
+ return rc;
+}
+
+long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
+{
+ struct ib_cq_init_attr cqattr = {
+ .cqe = SMC_WR_MAX_CQE, .comp_vector = 0 };
+ long rc;
+
+ smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
+ smc_wr_tx_cq_handler, NULL,
+ smcibdev, &cqattr);
+ rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
+ if (IS_ERR(smcibdev->roce_cq_send)) {
+ smcibdev->roce_cq_send = NULL;
+ return rc;
+ }
+ smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
+ smc_wr_rx_cq_handler, NULL,
+ smcibdev, &cqattr);
+ rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
+ if (IS_ERR(smcibdev->roce_cq_recv)) {
+ smcibdev->roce_cq_recv = NULL;
+ goto err;
+ }
+ INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
+ smc_ib_global_event_handler);
+ ib_register_event_handler(&smcibdev->event_handler);
+ smc_wr_add_dev(smcibdev);
+ smcibdev->initialized = 1;
+ return rc;
+
+err:
+ ib_destroy_cq(smcibdev->roce_cq_send);
+ return rc;
+}
+
+static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
+{
+ if (!smcibdev->initialized)
+ return;
+ smc_wr_remove_dev(smcibdev);
+ ib_unregister_event_handler(&smcibdev->event_handler);
+ ib_destroy_cq(smcibdev->roce_cq_recv);
+ ib_destroy_cq(smcibdev->roce_cq_send);
+}
+
+static struct ib_client smc_ib_client;
+
+/* callback function for ib_register_client() */
+static void smc_ib_add_dev(struct ib_device *ibdev)
+{
+ struct smc_ib_device *smcibdev;
+
+ if (ibdev->node_type != RDMA_NODE_IB_CA)
+ return;
+
+ smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
+ if (!smcibdev)
+ return;
+
+ smcibdev->ibdev = ibdev;
+ INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
+
+ spin_lock(&smc_ib_devices.lock);
+ list_add_tail(&smcibdev->list, &smc_ib_devices.list);
+ spin_unlock(&smc_ib_devices.lock);
+ ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
+}
+
+/* callback function for ib_register_client() */
+static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
+{
+ struct smc_ib_device *smcibdev;
+
+ smcibdev = ib_get_client_data(ibdev, &smc_ib_client);
+ ib_set_client_data(ibdev, &smc_ib_client, NULL);
+ spin_lock(&smc_ib_devices.lock);
+ list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
+ spin_unlock(&smc_ib_devices.lock);
+ smc_pnet_remove_by_ibdev(smcibdev);
+ smc_ib_cleanup_per_ibdev(smcibdev);
+ kfree(smcibdev);
+}
+
+static struct ib_client smc_ib_client = {
+ .name = "smc_ib",
+ .add = smc_ib_add_dev,
+ .remove = smc_ib_remove_dev,
+};
+
+int __init smc_ib_register_client(void)
+{
+ return ib_register_client(&smc_ib_client);
+}
+
+void smc_ib_unregister_client(void)
+{
+ ib_unregister_client(&smc_ib_client);
+}
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
new file mode 100644
index 000000000000..a95f74bb5569
--- /dev/null
+++ b/net/smc/smc_ib.h
@@ -0,0 +1,71 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for IB environment
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <Ursula Braun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_IB_H
+#define _SMC_IB_H
+
+#include <linux/if_ether.h>
+#include <rdma/ib_verbs.h>
+
+#define SMC_MAX_PORTS 2 /* Max # of ports */
+#define SMC_GID_SIZE sizeof(union ib_gid)
+
+#define SMC_IB_MAX_SEND_SGE 2
+
+struct smc_ib_devices { /* list of smc ib devices definition */
+ struct list_head list;
+ spinlock_t lock; /* protects list of smc ib devices */
+};
+
+extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */
+
+struct smc_ib_device { /* ib-device infos for smc */
+ struct list_head list;
+ struct ib_device *ibdev;
+ struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */
+ struct ib_event_handler event_handler; /* global ib_event handler */
+ struct ib_cq *roce_cq_send; /* send completion queue */
+ struct ib_cq *roce_cq_recv; /* recv completion queue */
+ struct tasklet_struct send_tasklet; /* called by send cq handler */
+ struct tasklet_struct recv_tasklet; /* called by recv cq handler */
+ char mac[SMC_MAX_PORTS][ETH_ALEN];
+ /* mac address per port*/
+ union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */
+ u8 initialized : 1; /* ib dev CQ, evthdl done */
+ struct work_struct port_event_work;
+ unsigned long port_event_mask;
+};
+
+struct smc_buf_desc;
+struct smc_link;
+
+int smc_ib_register_client(void) __init;
+void smc_ib_unregister_client(void);
+bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
+int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport);
+int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction);
+void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int bufsize,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction);
+void smc_ib_dealloc_protection_domain(struct smc_link *lnk);
+int smc_ib_create_protection_domain(struct smc_link *lnk);
+void smc_ib_destroy_queue_pair(struct smc_link *lnk);
+int smc_ib_create_queue_pair(struct smc_link *lnk);
+int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
+ struct ib_mr **mr);
+int smc_ib_ready_link(struct smc_link *lnk);
+int smc_ib_modify_qp_rts(struct smc_link *lnk);
+int smc_ib_modify_qp_reset(struct smc_link *lnk);
+long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
+
+
+#endif
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
new file mode 100644
index 000000000000..c2f9165d13ef
--- /dev/null
+++ b/net/smc/smc_llc.c
@@ -0,0 +1,158 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Link Layer Control (LLC)
+ *
+ * For now, we only support the necessary "confirm link" functionality
+ * which happens for the first RoCE link after successful CLC handshake.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
+ * Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <net/tcp.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_clc.h"
+#include "smc_llc.h"
+
+/********************************** send *************************************/
+
+struct smc_llc_tx_pend {
+};
+
+/* handler for send/transmission completion of an LLC msg */
+static void smc_llc_tx_handler(struct smc_wr_tx_pend_priv *pend,
+ struct smc_link *link,
+ enum ib_wc_status wc_status)
+{
+ /* future work: handle wc_status error for recovery and failover */
+}
+
+/**
+ * smc_llc_add_pending_send() - add LLC control message to pending WQE transmits
+ * @link: Pointer to SMC link used for sending LLC control message.
+ * @wr_buf: Out variable returning pointer to work request payload buffer.
+ * @pend: Out variable returning pointer to private pending WR tracking.
+ * It's the context the transmit complete handler will get.
+ *
+ * Reserves and pre-fills an entry for a pending work request send/tx.
+ * Used by mid-level smc_llc_send_msg() to prepare for later actual send/tx.
+ * Can sleep due to smc_get_ctrl_buf (if not in softirq context).
+ *
+ * Return: 0 on success, otherwise an error value.
+ */
+static int smc_llc_add_pending_send(struct smc_link *link,
+ struct smc_wr_buf **wr_buf,
+ struct smc_wr_tx_pend_priv **pend)
+{
+ int rc;
+
+ rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, pend);
+ if (rc < 0)
+ return rc;
+ BUILD_BUG_ON_MSG(
+ sizeof(union smc_llc_msg) > SMC_WR_BUF_SIZE,
+ "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_llc_msg)");
+ BUILD_BUG_ON_MSG(
+ sizeof(union smc_llc_msg) != SMC_WR_TX_SIZE,
+ "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_llc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
+ BUILD_BUG_ON_MSG(
+ sizeof(struct smc_llc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
+ "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_llc_tx_pend)");
+ return 0;
+}
+
+/* high-level API to send LLC confirm link */
+int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
+ union ib_gid *gid,
+ enum smc_llc_reqresp reqresp)
+{
+ struct smc_link_group *lgr = container_of(link, struct smc_link_group,
+ lnk[SMC_SINGLE_LINK]);
+ struct smc_llc_msg_confirm_link *confllc;
+ struct smc_wr_tx_pend_priv *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (rc)
+ return rc;
+ confllc = (struct smc_llc_msg_confirm_link *)wr_buf;
+ memset(confllc, 0, sizeof(*confllc));
+ confllc->hd.common.type = SMC_LLC_CONFIRM_LINK;
+ confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link);
+ if (reqresp == SMC_LLC_RESP)
+ confllc->hd.flags |= SMC_LLC_FLAG_RESP;
+ memcpy(confllc->sender_mac, mac, ETH_ALEN);
+ memcpy(confllc->sender_gid, gid, SMC_GID_SIZE);
+ hton24(confllc->sender_qp_num, link->roce_qp->qp_num);
+ /* confllc->link_num = SMC_SINGLE_LINK; already done by memset above */
+ memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE);
+ confllc->max_links = SMC_LINKS_PER_LGR_MAX;
+ /* send llc message */
+ rc = smc_wr_tx_send(link, pend);
+ return rc;
+}
+
+/********************************* receive ***********************************/
+
+static void smc_llc_rx_confirm_link(struct smc_link *link,
+ struct smc_llc_msg_confirm_link *llc)
+{
+ struct smc_link_group *lgr;
+
+ lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+ if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
+ if (lgr->role == SMC_SERV)
+ complete(&link->llc_confirm_resp);
+ } else {
+ if (lgr->role == SMC_CLNT) {
+ link->link_id = llc->link_num;
+ complete(&link->llc_confirm);
+ }
+ }
+}
+
+static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
+{
+ struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+ union smc_llc_msg *llc = buf;
+
+ if (wc->byte_len < sizeof(*llc))
+ return; /* short message */
+ if (llc->raw.hdr.length != sizeof(*llc))
+ return; /* invalid message */
+ if (llc->raw.hdr.common.type == SMC_LLC_CONFIRM_LINK)
+ smc_llc_rx_confirm_link(link, &llc->confirm_link);
+}
+
+/***************************** init, exit, misc ******************************/
+
+static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_CONFIRM_LINK
+ },
+ {
+ .handler = NULL,
+ }
+};
+
+int __init smc_llc_init(void)
+{
+ struct smc_wr_rx_handler *handler;
+ int rc = 0;
+
+ for (handler = smc_llc_rx_handlers; handler->handler; handler++) {
+ INIT_HLIST_NODE(&handler->list);
+ rc = smc_wr_rx_register_handler(handler);
+ if (rc)
+ break;
+ }
+ return rc;
+}
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
new file mode 100644
index 000000000000..b472f853953a
--- /dev/null
+++ b/net/smc/smc_llc.h
@@ -0,0 +1,63 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for LLC (link layer control) message handling
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
+ * Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_LLC_H
+#define SMC_LLC_H
+
+#include "smc_wr.h"
+
+#define SMC_LLC_FLAG_RESP 0x80
+
+#define SMC_LLC_WAIT_FIRST_TIME (5 * HZ)
+
+enum smc_llc_reqresp {
+ SMC_LLC_REQ,
+ SMC_LLC_RESP
+};
+
+enum smc_llc_msg_type {
+ SMC_LLC_CONFIRM_LINK = 0x01,
+};
+
+#define SMC_LLC_DATA_LEN 40
+
+struct smc_llc_hdr {
+ struct smc_wr_rx_hdr common;
+ u8 length; /* 44 */
+ u8 reserved;
+ u8 flags;
+};
+
+struct smc_llc_msg_confirm_link { /* type 0x01 */
+ struct smc_llc_hdr hd;
+ u8 sender_mac[ETH_ALEN];
+ u8 sender_gid[SMC_GID_SIZE];
+ u8 sender_qp_num[3];
+ u8 link_num;
+ u8 link_uid[SMC_LGR_ID_SIZE];
+ u8 max_links;
+ u8 reserved[9];
+};
+
+union smc_llc_msg {
+ struct smc_llc_msg_confirm_link confirm_link;
+ struct {
+ struct smc_llc_hdr hdr;
+ u8 data[SMC_LLC_DATA_LEN];
+ } raw;
+};
+
+/* transmit */
+int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid,
+ enum smc_llc_reqresp reqresp);
+int smc_llc_init(void) __init;
+
+#endif /* SMC_LLC_H */
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
new file mode 100644
index 000000000000..9d3e7fb8348d
--- /dev/null
+++ b/net/smc/smc_pnet.c
@@ -0,0 +1,534 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Generic netlink support functions to configure an SMC-R PNET table
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/if.h>
+#include <uapi/linux/smc.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "smc_pnet.h"
+#include "smc_ib.h"
+
+#define SMC_MAX_PNET_ID_LEN 16 /* Max. length of PNET id */
+
+static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
+ [SMC_PNETID_NAME] = {
+ .type = NLA_NUL_STRING,
+ .len = SMC_MAX_PNET_ID_LEN - 1
+ },
+ [SMC_PNETID_ETHNAME] = {
+ .type = NLA_NUL_STRING,
+ .len = IFNAMSIZ - 1
+ },
+ [SMC_PNETID_IBNAME] = {
+ .type = NLA_NUL_STRING,
+ .len = IB_DEVICE_NAME_MAX - 1
+ },
+ [SMC_PNETID_IBPORT] = { .type = NLA_U8 }
+};
+
+static struct genl_family smc_pnet_nl_family;
+
+/**
+ * struct smc_pnettable - SMC PNET table anchor
+ * @lock: Lock for list action
+ * @pnetlist: List of PNETIDs
+ */
+static struct smc_pnettable {
+ rwlock_t lock;
+ struct list_head pnetlist;
+} smc_pnettable = {
+ .pnetlist = LIST_HEAD_INIT(smc_pnettable.pnetlist),
+ .lock = __RW_LOCK_UNLOCKED(smc_pnettable.lock)
+};
+
+/**
+ * struct smc_pnetentry - pnet identifier name entry
+ * @list: List node.
+ * @pnet_name: Pnet identifier name
+ * @ndev: pointer to network device.
+ * @smcibdev: Pointer to IB device.
+ */
+struct smc_pnetentry {
+ struct list_head list;
+ char pnet_name[SMC_MAX_PNET_ID_LEN + 1];
+ struct net_device *ndev;
+ struct smc_ib_device *smcibdev;
+ u8 ib_port;
+};
+
+/* Check if two RDMA device entries are identical. Use device name and port
+ * number for comparison.
+ */
+static bool smc_pnet_same_ibname(struct smc_pnetentry *pnetelem, char *ibname,
+ u8 ibport)
+{
+ return pnetelem->ib_port == ibport &&
+ !strncmp(pnetelem->smcibdev->ibdev->name, ibname,
+ sizeof(pnetelem->smcibdev->ibdev->name));
+}
+
+/* Find a pnetid in the pnet table.
+ */
+static struct smc_pnetentry *smc_pnet_find_pnetid(char *pnet_name)
+{
+ struct smc_pnetentry *pnetelem, *found_pnetelem = NULL;
+
+ read_lock(&smc_pnettable.lock);
+ list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+ if (!strncmp(pnetelem->pnet_name, pnet_name,
+ sizeof(pnetelem->pnet_name))) {
+ found_pnetelem = pnetelem;
+ break;
+ }
+ }
+ read_unlock(&smc_pnettable.lock);
+ return found_pnetelem;
+}
+
+/* Remove a pnetid from the pnet table.
+ */
+static int smc_pnet_remove_by_pnetid(char *pnet_name)
+{
+ struct smc_pnetentry *pnetelem, *tmp_pe;
+ int rc = -ENOENT;
+
+ write_lock(&smc_pnettable.lock);
+ list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
+ list) {
+ if (!strncmp(pnetelem->pnet_name, pnet_name,
+ sizeof(pnetelem->pnet_name))) {
+ list_del(&pnetelem->list);
+ dev_put(pnetelem->ndev);
+ kfree(pnetelem);
+ rc = 0;
+ break;
+ }
+ }
+ write_unlock(&smc_pnettable.lock);
+ return rc;
+}
+
+/* Remove a pnet entry mentioning a given network device from the pnet table.
+ */
+static int smc_pnet_remove_by_ndev(struct net_device *ndev)
+{
+ struct smc_pnetentry *pnetelem, *tmp_pe;
+ int rc = -ENOENT;
+
+ write_lock(&smc_pnettable.lock);
+ list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
+ list) {
+ if (pnetelem->ndev == ndev) {
+ list_del(&pnetelem->list);
+ dev_put(pnetelem->ndev);
+ kfree(pnetelem);
+ rc = 0;
+ break;
+ }
+ }
+ write_unlock(&smc_pnettable.lock);
+ return rc;
+}
+
+/* Remove a pnet entry mentioning a given ib device from the pnet table.
+ */
+int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev)
+{
+ struct smc_pnetentry *pnetelem, *tmp_pe;
+ int rc = -ENOENT;
+
+ write_lock(&smc_pnettable.lock);
+ list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
+ list) {
+ if (pnetelem->smcibdev == ibdev) {
+ list_del(&pnetelem->list);
+ dev_put(pnetelem->ndev);
+ kfree(pnetelem);
+ rc = 0;
+ break;
+ }
+ }
+ write_unlock(&smc_pnettable.lock);
+ return rc;
+}
+
+/* Append a pnetid to the end of the pnet table if not already on this list.
+ */
+static int smc_pnet_enter(struct smc_pnetentry *new_pnetelem)
+{
+ struct smc_pnetentry *pnetelem;
+ int rc = -EEXIST;
+
+ write_lock(&smc_pnettable.lock);
+ list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+ if (!strncmp(pnetelem->pnet_name, new_pnetelem->pnet_name,
+ sizeof(new_pnetelem->pnet_name)) ||
+ !strncmp(pnetelem->ndev->name, new_pnetelem->ndev->name,
+ sizeof(new_pnetelem->ndev->name)) ||
+ smc_pnet_same_ibname(pnetelem,
+ new_pnetelem->smcibdev->ibdev->name,
+ new_pnetelem->ib_port))
+ goto found;
+ }
+ list_add_tail(&new_pnetelem->list, &smc_pnettable.pnetlist);
+ rc = 0;
+found:
+ write_unlock(&smc_pnettable.lock);
+ return rc;
+}
+
+/* The limit for pnetid is 16 characters.
+ * Valid characters should be (single-byte character set) a-z, A-Z, 0-9.
+ * Lower case letters are converted to upper case.
+ * Interior blanks should not be used.
+ */
+static bool smc_pnetid_valid(const char *pnet_name, char *pnetid)
+{
+ char *bf = skip_spaces(pnet_name);
+ size_t len = strlen(bf);
+ char *end = bf + len;
+
+ if (!len)
+ return false;
+ while (--end >= bf && isspace(*end))
+ ;
+ if (end - bf >= SMC_MAX_PNET_ID_LEN)
+ return false;
+ while (bf <= end) {
+ if (!isalnum(*bf))
+ return false;
+ *pnetid++ = islower(*bf) ? toupper(*bf) : *bf;
+ bf++;
+ }
+ *pnetid = '\0';
+ return true;
+}
+
+/* Find an infiniband device by a given name. The device might not exist. */
+struct smc_ib_device *smc_pnet_find_ib(char *ib_name)
+{
+ struct smc_ib_device *ibdev;
+
+ spin_lock(&smc_ib_devices.lock);
+ list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
+ if (!strncmp(ibdev->ibdev->name, ib_name,
+ sizeof(ibdev->ibdev->name))) {
+ goto out;
+ }
+ }
+ ibdev = NULL;
+out:
+ spin_unlock(&smc_ib_devices.lock);
+ return ibdev;
+}
+
+/* Parse the supplied netlink attributes and fill a pnetentry structure.
+ * For ethernet and infiniband device names verify that the devices exist.
+ */
+static int smc_pnet_fill_entry(struct net *net, struct smc_pnetentry *pnetelem,
+ struct nlattr *tb[])
+{
+ char *string, *ibname = NULL;
+ int rc = 0;
+
+ memset(pnetelem, 0, sizeof(*pnetelem));
+ INIT_LIST_HEAD(&pnetelem->list);
+ if (tb[SMC_PNETID_NAME]) {
+ string = (char *)nla_data(tb[SMC_PNETID_NAME]);
+ if (!smc_pnetid_valid(string, pnetelem->pnet_name)) {
+ rc = -EINVAL;
+ goto error;
+ }
+ }
+ if (tb[SMC_PNETID_ETHNAME]) {
+ string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
+ pnetelem->ndev = dev_get_by_name(net, string);
+ if (!pnetelem->ndev)
+ return -ENOENT;
+ }
+ if (tb[SMC_PNETID_IBNAME]) {
+ ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
+ ibname = strim(ibname);
+ pnetelem->smcibdev = smc_pnet_find_ib(ibname);
+ if (!pnetelem->smcibdev) {
+ rc = -ENOENT;
+ goto error;
+ }
+ }
+ if (tb[SMC_PNETID_IBPORT]) {
+ pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]);
+ if (pnetelem->ib_port > SMC_MAX_PORTS) {
+ rc = -EINVAL;
+ goto error;
+ }
+ }
+ return 0;
+
+error:
+ if (pnetelem->ndev)
+ dev_put(pnetelem->ndev);
+ return rc;
+}
+
+/* Convert an smc_pnetentry to a netlink attribute sequence */
+static int smc_pnet_set_nla(struct sk_buff *msg, struct smc_pnetentry *pnetelem)
+{
+ if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name) ||
+ nla_put_string(msg, SMC_PNETID_ETHNAME, pnetelem->ndev->name) ||
+ nla_put_string(msg, SMC_PNETID_IBNAME,
+ pnetelem->smcibdev->ibdev->name) ||
+ nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port))
+ return -1;
+ return 0;
+}
+
+/* Retrieve one PNETID entry */
+static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct smc_pnetentry *pnetelem;
+ struct sk_buff *msg;
+ void *hdr;
+ int rc;
+
+ pnetelem = smc_pnet_find_pnetid(
+ (char *)nla_data(info->attrs[SMC_PNETID_NAME]));
+ if (!pnetelem)
+ return -ENOENT;
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &smc_pnet_nl_family, 0, SMC_PNETID_GET);
+ if (!hdr) {
+ rc = -EMSGSIZE;
+ goto err_out;
+ }
+
+ if (smc_pnet_set_nla(msg, pnetelem)) {
+ rc = -ENOBUFS;
+ goto err_out;
+ }
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+err_out:
+ nlmsg_free(msg);
+ return rc;
+}
+
+static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct smc_pnetentry *pnetelem;
+ int rc;
+
+ pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL);
+ if (!pnetelem)
+ return -ENOMEM;
+ rc = smc_pnet_fill_entry(net, pnetelem, info->attrs);
+ if (!rc)
+ rc = smc_pnet_enter(pnetelem);
+ if (rc) {
+ kfree(pnetelem);
+ return rc;
+ }
+ rc = smc_ib_remember_port_attr(pnetelem->smcibdev, pnetelem->ib_port);
+ if (rc)
+ smc_pnet_remove_by_pnetid(pnetelem->pnet_name);
+ return rc;
+}
+
+static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info)
+{
+ return smc_pnet_remove_by_pnetid(
+ (char *)nla_data(info->attrs[SMC_PNETID_NAME]));
+}
+
+static int smc_pnet_dump_start(struct netlink_callback *cb)
+{
+ cb->args[0] = 0;
+ return 0;
+}
+
+static int smc_pnet_dumpinfo(struct sk_buff *skb,
+ u32 portid, u32 seq, u32 flags,
+ struct smc_pnetentry *pnetelem)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family,
+ flags, SMC_PNETID_GET);
+ if (!hdr)
+ return -ENOMEM;
+ if (smc_pnet_set_nla(skb, pnetelem) < 0) {
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+ }
+ genlmsg_end(skb, hdr);
+ return 0;
+}
+
+static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct smc_pnetentry *pnetelem;
+ int idx = 0;
+
+ read_lock(&smc_pnettable.lock);
+ list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+ if (idx++ < cb->args[0])
+ continue;
+ if (smc_pnet_dumpinfo(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ pnetelem)) {
+ --idx;
+ break;
+ }
+ }
+ cb->args[0] = idx;
+ read_unlock(&smc_pnettable.lock);
+ return skb->len;
+}
+
+/* Remove and delete all pnetids from pnet table.
+ */
+static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info)
+{
+ struct smc_pnetentry *pnetelem, *tmp_pe;
+
+ write_lock(&smc_pnettable.lock);
+ list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
+ list) {
+ list_del(&pnetelem->list);
+ dev_put(pnetelem->ndev);
+ kfree(pnetelem);
+ }
+ write_unlock(&smc_pnettable.lock);
+ return 0;
+}
+
+/* SMC_PNETID generic netlink operation definition */
+static const struct genl_ops smc_pnet_ops[] = {
+ {
+ .cmd = SMC_PNETID_GET,
+ .flags = GENL_ADMIN_PERM,
+ .policy = smc_pnet_policy,
+ .doit = smc_pnet_get,
+ .dumpit = smc_pnet_dump,
+ .start = smc_pnet_dump_start
+ },
+ {
+ .cmd = SMC_PNETID_ADD,
+ .flags = GENL_ADMIN_PERM,
+ .policy = smc_pnet_policy,
+ .doit = smc_pnet_add
+ },
+ {
+ .cmd = SMC_PNETID_DEL,
+ .flags = GENL_ADMIN_PERM,
+ .policy = smc_pnet_policy,
+ .doit = smc_pnet_del
+ },
+ {
+ .cmd = SMC_PNETID_FLUSH,
+ .flags = GENL_ADMIN_PERM,
+ .policy = smc_pnet_policy,
+ .doit = smc_pnet_flush
+ }
+};
+
+/* SMC_PNETID family definition */
+static struct genl_family smc_pnet_nl_family = {
+ .hdrsize = 0,
+ .name = SMCR_GENL_FAMILY_NAME,
+ .version = SMCR_GENL_FAMILY_VERSION,
+ .maxattr = SMC_PNETID_MAX,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = smc_pnet_ops,
+ .n_ops = ARRAY_SIZE(smc_pnet_ops)
+};
+
+static int smc_pnet_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+ switch (event) {
+ case NETDEV_REBOOT:
+ case NETDEV_UNREGISTER:
+ smc_pnet_remove_by_ndev(event_dev);
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block smc_netdev_notifier = {
+ .notifier_call = smc_pnet_netdev_event
+};
+
+int __init smc_pnet_init(void)
+{
+ int rc;
+
+ rc = genl_register_family(&smc_pnet_nl_family);
+ if (rc)
+ return rc;
+ rc = register_netdevice_notifier(&smc_netdev_notifier);
+ if (rc)
+ genl_unregister_family(&smc_pnet_nl_family);
+ return rc;
+}
+
+void smc_pnet_exit(void)
+{
+ smc_pnet_flush(NULL, NULL);
+ unregister_netdevice_notifier(&smc_netdev_notifier);
+ genl_unregister_family(&smc_pnet_nl_family);
+}
+
+/* PNET table analysis for a given sock:
+ * determine ib_device and port belonging to used internal TCP socket
+ * ethernet interface.
+ */
+void smc_pnet_find_roce_resource(struct sock *sk,
+ struct smc_ib_device **smcibdev, u8 *ibport)
+{
+ struct dst_entry *dst = sk_dst_get(sk);
+ struct smc_pnetentry *pnetelem;
+
+ *smcibdev = NULL;
+ *ibport = 0;
+
+ if (!dst)
+ return;
+ if (!dst->dev)
+ goto out_rel;
+ read_lock(&smc_pnettable.lock);
+ list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+ if (dst->dev == pnetelem->ndev) {
+ *smcibdev = pnetelem->smcibdev;
+ *ibport = pnetelem->ib_port;
+ break;
+ }
+ }
+ read_unlock(&smc_pnettable.lock);
+out_rel:
+ dst_release(dst);
+}
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
new file mode 100644
index 000000000000..32ab3df928ca
--- /dev/null
+++ b/net/smc/smc_pnet.h
@@ -0,0 +1,23 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * PNET table queries
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_PNET_H
+#define _SMC_PNET_H
+
+struct smc_ib_device;
+
+int smc_pnet_init(void) __init;
+void smc_pnet_exit(void);
+int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev);
+struct smc_ib_device *smc_pnet_find_ib(char *ib_name);
+void smc_pnet_find_roce_resource(struct sock *sk,
+ struct smc_ib_device **smcibdev, u8 *ibport);
+
+#endif
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
new file mode 100644
index 000000000000..c4ef9a4ec569
--- /dev/null
+++ b/net/smc/smc_rx.c
@@ -0,0 +1,219 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage RMBE
+ * copy new RMBE data into user space
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/signal.h>
+
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_cdc.h"
+#include "smc_tx.h" /* smc_tx_consumer_update() */
+#include "smc_rx.h"
+
+/* callback implementation for sk.sk_data_ready()
+ * to wakeup rcvbuf consumers that blocked with smc_rx_wait_data().
+ * indirectly called by smc_cdc_msg_recv_action().
+ */
+static void smc_rx_data_ready(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ /* derived from sock_def_readable() */
+ /* called already in smc_listen_work() */
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
+ POLLRDNORM | POLLRDBAND);
+ if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
+ (sk->sk_state == SMC_CLOSED))
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+ else
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ rcu_read_unlock();
+}
+
+/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted
+ * @smc smc socket
+ * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout
+ * Returns:
+ * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
+ * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
+ */
+static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ int rc;
+
+ if (atomic_read(&conn->bytes_to_rcv))
+ return 1;
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ add_wait_queue(sk_sleep(sk), &wait);
+ rc = sk_wait_event(sk, timeo,
+ sk->sk_err ||
+ sk->sk_shutdown & RCV_SHUTDOWN ||
+ sock_flag(sk, SOCK_DONE) ||
+ atomic_read(&conn->bytes_to_rcv) ||
+ smc_cdc_rxed_any_close_or_senddone(conn),
+ &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ return rc;
+}
+
+/* rcvbuf consumer: main API called by socket layer.
+ * called under sk lock.
+ */
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
+ int flags)
+{
+ size_t copylen, read_done = 0, read_remaining = len;
+ size_t chunk_len, chunk_off, chunk_len_sum;
+ struct smc_connection *conn = &smc->conn;
+ union smc_host_cursor cons;
+ int readable, chunk;
+ char *rcvbuf_base;
+ struct sock *sk;
+ long timeo;
+ int target; /* Read at least these many bytes */
+ int rc;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return -EINVAL; /* future work for sk.sk_family == AF_SMC */
+ if (flags & MSG_OOB)
+ return -EINVAL; /* future work */
+
+ sk = &smc->sk;
+ if (sk->sk_state == SMC_LISTEN)
+ return -ENOTCONN;
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+
+ msg->msg_namelen = 0;
+ /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
+ rcvbuf_base = conn->rmb_desc->cpu_addr;
+
+ do { /* while (read_remaining) */
+ if (read_done >= target)
+ break;
+
+ if (atomic_read(&conn->bytes_to_rcv))
+ goto copy;
+
+ if (read_done) {
+ if (sk->sk_err ||
+ sk->sk_state == SMC_CLOSED ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ !timeo ||
+ signal_pending(current) ||
+ smc_cdc_rxed_any_close_or_senddone(conn) ||
+ conn->local_tx_ctrl.conn_state_flags.
+ peer_conn_abort)
+ break;
+ } else {
+ if (sock_flag(sk, SOCK_DONE))
+ break;
+ if (sk->sk_err) {
+ read_done = sock_error(sk);
+ break;
+ }
+ if (sk->sk_shutdown & RCV_SHUTDOWN ||
+ smc_cdc_rxed_any_close_or_senddone(conn) ||
+ conn->local_tx_ctrl.conn_state_flags.
+ peer_conn_abort)
+ break;
+ if (sk->sk_state == SMC_CLOSED) {
+ if (!sock_flag(sk, SOCK_DONE)) {
+ /* This occurs when user tries to read
+ * from never connected socket.
+ */
+ read_done = -ENOTCONN;
+ break;
+ }
+ break;
+ }
+ if (signal_pending(current)) {
+ read_done = sock_intr_errno(timeo);
+ break;
+ }
+ }
+
+ if (!atomic_read(&conn->bytes_to_rcv)) {
+ smc_rx_wait_data(smc, &timeo);
+ continue;
+ }
+
+copy:
+ /* initialize variables for 1st iteration of subsequent loop */
+ /* could be just 1 byte, even after smc_rx_wait_data above */
+ readable = atomic_read(&conn->bytes_to_rcv);
+ /* not more than what user space asked for */
+ copylen = min_t(size_t, read_remaining, readable);
+ smc_curs_write(&cons,
+ smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+ conn);
+ /* determine chunks where to read from rcvbuf */
+ /* either unwrapped case, or 1st chunk of wrapped case */
+ chunk_len = min_t(size_t,
+ copylen, conn->rmbe_size - cons.count);
+ chunk_len_sum = chunk_len;
+ chunk_off = cons.count;
+ for (chunk = 0; chunk < 2; chunk++) {
+ if (!(flags & MSG_TRUNC)) {
+ rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off,
+ chunk_len);
+ if (rc) {
+ if (!read_done)
+ read_done = -EFAULT;
+ goto out;
+ }
+ }
+ read_remaining -= chunk_len;
+ read_done += chunk_len;
+
+ if (chunk_len_sum == copylen)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ chunk_len = copylen - chunk_len; /* remainder */
+ chunk_len_sum += chunk_len;
+ chunk_off = 0; /* modulo offset in recv ring buffer */
+ }
+
+ /* update cursors */
+ if (!(flags & MSG_PEEK)) {
+ smc_curs_add(conn->rmbe_size, &cons, copylen);
+ /* increased in recv tasklet smc_cdc_msg_rcv() */
+ smp_mb__before_atomic();
+ atomic_sub(copylen, &conn->bytes_to_rcv);
+ /* guarantee 0 <= bytes_to_rcv <= rmbe_size */
+ smp_mb__after_atomic();
+ smc_curs_write(&conn->local_tx_ctrl.cons,
+ smc_curs_read(&cons, conn),
+ conn);
+ /* send consumer cursor update if required */
+ /* similar to advertising new TCP rcv_wnd if required */
+ smc_tx_consumer_update(conn);
+ }
+ } while (read_remaining);
+out:
+ return read_done;
+}
+
+/* Initialize receive properties on connection establishment. NB: not __init! */
+void smc_rx_init(struct smc_sock *smc)
+{
+ smc->sk.sk_data_ready = smc_rx_data_ready;
+}
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
new file mode 100644
index 000000000000..b5b80e1f8b0f
--- /dev/null
+++ b/net/smc/smc_rx.h
@@ -0,0 +1,23 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage RMBE
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_RX_H
+#define SMC_RX_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+#include "smc.h"
+
+void smc_rx_init(struct smc_sock *smc);
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
+ int flags);
+
+#endif /* SMC_RX_H */
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
new file mode 100644
index 000000000000..69a0013dd25c
--- /dev/null
+++ b/net/smc/smc_tx.c
@@ -0,0 +1,485 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage send buffer.
+ * Producer:
+ * Copy user space data into send buffer, if send buffer space available.
+ * Consumer:
+ * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <linux/sched/signal.h>
+
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+#include "smc_cdc.h"
+#include "smc_tx.h"
+
+/***************************** sndbuf producer *******************************/
+
+/* callback implementation for sk.sk_write_space()
+ * to wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+ * called under sk_socket lock.
+ */
+static void smc_tx_write_space(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+ struct smc_sock *smc = smc_sk(sk);
+ struct socket_wq *wq;
+
+ /* similar to sk_stream_write_space */
+ if (atomic_read(&smc->conn.sndbuf_space) && sock) {
+ clear_bit(SOCK_NOSPACE, &sock->flags);
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_poll(&wq->wait,
+ POLLOUT | POLLWRNORM |
+ POLLWRBAND);
+ if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
+ sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
+ rcu_read_unlock();
+ }
+}
+
+/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+ * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
+ */
+void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
+{
+ if (smc->sk.sk_socket &&
+ test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags))
+ smc->sk.sk_write_space(&smc->sk);
+}
+
+/* blocks sndbuf producer until at least one byte of free space available */
+static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ bool noblock;
+ long timeo;
+ int rc = 0;
+
+ /* similar to sk_stream_wait_memory */
+ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ noblock = timeo ? false : true;
+ add_wait_queue(sk_sleep(sk), &wait);
+ while (1) {
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ if (sk->sk_err ||
+ (sk->sk_shutdown & SEND_SHUTDOWN) ||
+ conn->local_tx_ctrl.conn_state_flags.peer_done_writing) {
+ rc = -EPIPE;
+ break;
+ }
+ if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
+ rc = -ECONNRESET;
+ break;
+ }
+ if (!timeo) {
+ if (noblock)
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ rc = -EAGAIN;
+ break;
+ }
+ if (signal_pending(current)) {
+ rc = sock_intr_errno(timeo);
+ break;
+ }
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ if (atomic_read(&conn->sndbuf_space))
+ break; /* at least 1 byte of free space available */
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ sk->sk_write_pending++;
+ sk_wait_event(sk, &timeo,
+ sk->sk_err ||
+ (sk->sk_shutdown & SEND_SHUTDOWN) ||
+ smc_cdc_rxed_any_close_or_senddone(conn) ||
+ atomic_read(&conn->sndbuf_space),
+ &wait);
+ sk->sk_write_pending--;
+ }
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return rc;
+}
+
+/* sndbuf producer: main API called by socket layer.
+ * called under sock lock.
+ */
+int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
+{
+ size_t copylen, send_done = 0, send_remaining = len;
+ size_t chunk_len, chunk_off, chunk_len_sum;
+ struct smc_connection *conn = &smc->conn;
+ union smc_host_cursor prep;
+ struct sock *sk = &smc->sk;
+ char *sndbuf_base;
+ int tx_cnt_prep;
+ int writespace;
+ int rc, chunk;
+
+ /* This should be in poll */
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
+ rc = -EPIPE;
+ goto out_err;
+ }
+
+ while (msg_data_left(msg)) {
+ if (sk->sk_state == SMC_INIT)
+ return -ENOTCONN;
+ if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
+ (smc->sk.sk_err == ECONNABORTED) ||
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort)
+ return -EPIPE;
+ if (smc_cdc_rxed_any_close(conn))
+ return send_done ?: -ECONNRESET;
+
+ if (!atomic_read(&conn->sndbuf_space)) {
+ rc = smc_tx_wait_memory(smc, msg->msg_flags);
+ if (rc) {
+ if (send_done)
+ return send_done;
+ goto out_err;
+ }
+ continue;
+ }
+
+ /* initialize variables for 1st iteration of subsequent loop */
+ /* could be just 1 byte, even after smc_tx_wait_memory above */
+ writespace = atomic_read(&conn->sndbuf_space);
+ /* not more than what user space asked for */
+ copylen = min_t(size_t, send_remaining, writespace);
+ /* determine start of sndbuf */
+ sndbuf_base = conn->sndbuf_desc->cpu_addr;
+ smc_curs_write(&prep,
+ smc_curs_read(&conn->tx_curs_prep, conn),
+ conn);
+ tx_cnt_prep = prep.count;
+ /* determine chunks where to write into sndbuf */
+ /* either unwrapped case, or 1st chunk of wrapped case */
+ chunk_len = min_t(size_t,
+ copylen, conn->sndbuf_size - tx_cnt_prep);
+ chunk_len_sum = chunk_len;
+ chunk_off = tx_cnt_prep;
+ for (chunk = 0; chunk < 2; chunk++) {
+ rc = memcpy_from_msg(sndbuf_base + chunk_off,
+ msg, chunk_len);
+ if (rc) {
+ if (send_done)
+ return send_done;
+ goto out_err;
+ }
+ send_done += chunk_len;
+ send_remaining -= chunk_len;
+
+ if (chunk_len_sum == copylen)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ chunk_len = copylen - chunk_len; /* remainder */
+ chunk_len_sum += chunk_len;
+ chunk_off = 0; /* modulo offset in send ring buffer */
+ }
+ /* update cursors */
+ smc_curs_add(conn->sndbuf_size, &prep, copylen);
+ smc_curs_write(&conn->tx_curs_prep,
+ smc_curs_read(&prep, conn),
+ conn);
+ /* increased in send tasklet smc_cdc_tx_handler() */
+ smp_mb__before_atomic();
+ atomic_sub(copylen, &conn->sndbuf_space);
+ /* guarantee 0 <= sndbuf_space <= sndbuf_size */
+ smp_mb__after_atomic();
+ /* since we just produced more new data into sndbuf,
+ * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
+ */
+ smc_tx_sndbuf_nonempty(conn);
+ } /* while (msg_data_left(msg)) */
+
+ return send_done;
+
+out_err:
+ rc = sk_stream_error(sk, msg->msg_flags, rc);
+ /* make sure we wake any epoll edge trigger waiter */
+ if (unlikely(rc == -EAGAIN))
+ sk->sk_write_space(sk);
+ return rc;
+}
+
+/***************************** sndbuf consumer *******************************/
+
+/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */
+static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
+ int num_sges, struct ib_sge sges[])
+{
+ struct smc_link_group *lgr = conn->lgr;
+ struct ib_send_wr *failed_wr = NULL;
+ struct ib_rdma_wr rdma_wr;
+ struct smc_link *link;
+ int rc;
+
+ memset(&rdma_wr, 0, sizeof(rdma_wr));
+ link = &lgr->lnk[SMC_SINGLE_LINK];
+ rdma_wr.wr.wr_id = smc_wr_tx_get_next_wr_id(link);
+ rdma_wr.wr.sg_list = sges;
+ rdma_wr.wr.num_sge = num_sges;
+ rdma_wr.wr.opcode = IB_WR_RDMA_WRITE;
+ rdma_wr.remote_addr =
+ lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr +
+ /* RMBE within RMB */
+ ((conn->peer_conn_idx - 1) * conn->peer_rmbe_size) +
+ /* offset within RMBE */
+ peer_rmbe_offset;
+ rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
+ rc = ib_post_send(link->roce_qp, &rdma_wr.wr, &failed_wr);
+ if (rc)
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ return rc;
+}
+
+/* sndbuf consumer */
+static inline void smc_tx_advance_cursors(struct smc_connection *conn,
+ union smc_host_cursor *prod,
+ union smc_host_cursor *sent,
+ size_t len)
+{
+ smc_curs_add(conn->peer_rmbe_size, prod, len);
+ /* increased in recv tasklet smc_cdc_msg_rcv() */
+ smp_mb__before_atomic();
+ /* data in flight reduces usable snd_wnd */
+ atomic_sub(len, &conn->peer_rmbe_space);
+ /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
+ smp_mb__after_atomic();
+ smc_curs_add(conn->sndbuf_size, sent, len);
+}
+
+/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
+ * usable snd_wnd as max transmit
+ */
+static int smc_tx_rdma_writes(struct smc_connection *conn)
+{
+ size_t src_off, src_len, dst_off, dst_len; /* current chunk values */
+ size_t len, dst_len_sum, src_len_sum, dstchunk, srcchunk;
+ union smc_host_cursor sent, prep, prod, cons;
+ struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
+ struct smc_link_group *lgr = conn->lgr;
+ int to_send, rmbespace;
+ struct smc_link *link;
+ int num_sges;
+ int rc;
+
+ /* source: sndbuf */
+ smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
+ smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
+ /* cf. wmem_alloc - (snd_max - snd_una) */
+ to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep);
+ if (to_send <= 0)
+ return 0;
+
+ /* destination: RMBE */
+ /* cf. snd_wnd */
+ rmbespace = atomic_read(&conn->peer_rmbe_space);
+ if (rmbespace <= 0)
+ return 0;
+ smc_curs_write(&prod,
+ smc_curs_read(&conn->local_tx_ctrl.prod, conn),
+ conn);
+ smc_curs_write(&cons,
+ smc_curs_read(&conn->local_rx_ctrl.cons, conn),
+ conn);
+
+ /* if usable snd_wnd closes ask peer to advertise once it opens again */
+ conn->local_tx_ctrl.prod_flags.write_blocked = (to_send >= rmbespace);
+ /* cf. usable snd_wnd */
+ len = min(to_send, rmbespace);
+
+ /* initialize variables for first iteration of subsequent nested loop */
+ link = &lgr->lnk[SMC_SINGLE_LINK];
+ dst_off = prod.count;
+ if (prod.wrap == cons.wrap) {
+ /* the filled destination area is unwrapped,
+ * hence the available free destination space is wrapped
+ * and we need 2 destination chunks of sum len; start with 1st
+ * which is limited by what's available in sndbuf
+ */
+ dst_len = min_t(size_t,
+ conn->peer_rmbe_size - prod.count, len);
+ } else {
+ /* the filled destination area is wrapped,
+ * hence the available free destination space is unwrapped
+ * and we need a single destination chunk of entire len
+ */
+ dst_len = len;
+ }
+ dst_len_sum = dst_len;
+ src_off = sent.count;
+ /* dst_len determines the maximum src_len */
+ if (sent.count + dst_len <= conn->sndbuf_size) {
+ /* unwrapped src case: single chunk of entire dst_len */
+ src_len = dst_len;
+ } else {
+ /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
+ src_len = conn->sndbuf_size - sent.count;
+ }
+ src_len_sum = src_len;
+ for (dstchunk = 0; dstchunk < 2; dstchunk++) {
+ num_sges = 0;
+ for (srcchunk = 0; srcchunk < 2; srcchunk++) {
+ sges[srcchunk].addr =
+ conn->sndbuf_desc->dma_addr[SMC_SINGLE_LINK] +
+ src_off;
+ sges[srcchunk].length = src_len;
+ sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
+ num_sges++;
+ src_off += src_len;
+ if (src_off >= conn->sndbuf_size)
+ src_off -= conn->sndbuf_size;
+ /* modulo in send ring */
+ if (src_len_sum == dst_len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ src_len = dst_len - src_len; /* remainder */
+ src_len_sum += src_len;
+ }
+ rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges);
+ if (rc)
+ return rc;
+ if (dst_len_sum == len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ dst_off = 0; /* modulo offset in RMBE ring buffer */
+ dst_len = len - dst_len; /* remainder */
+ dst_len_sum += dst_len;
+ src_len = min_t(int,
+ dst_len, conn->sndbuf_size - sent.count);
+ src_len_sum = src_len;
+ }
+
+ smc_tx_advance_cursors(conn, &prod, &sent, len);
+ /* update connection's cursors with advanced local cursors */
+ smc_curs_write(&conn->local_tx_ctrl.prod,
+ smc_curs_read(&prod, conn),
+ conn);
+ /* dst: peer RMBE */
+ smc_curs_write(&conn->tx_curs_sent,
+ smc_curs_read(&sent, conn),
+ conn);
+ /* src: local sndbuf */
+
+ return 0;
+}
+
+/* Wakeup sndbuf consumers from any context (IRQ or process)
+ * since there is more data to transmit; usable snd_wnd as max transmit
+ */
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+ struct smc_cdc_tx_pend *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ spin_lock_bh(&conn->send_lock);
+ rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
+ &pend);
+ if (rc < 0) {
+ if (rc == -EBUSY) {
+ struct smc_sock *smc =
+ container_of(conn, struct smc_sock, conn);
+
+ if (smc->sk.sk_err == ECONNABORTED) {
+ rc = sock_error(&smc->sk);
+ goto out_unlock;
+ }
+ rc = 0;
+ schedule_work(&conn->tx_work);
+ }
+ goto out_unlock;
+ }
+
+ rc = smc_tx_rdma_writes(conn);
+ if (rc) {
+ smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
+ (struct smc_wr_tx_pend_priv *)pend);
+ goto out_unlock;
+ }
+
+ rc = smc_cdc_msg_send(conn, wr_buf, pend);
+
+out_unlock:
+ spin_unlock_bh(&conn->send_lock);
+ return rc;
+}
+
+/* Wakeup sndbuf consumers from process context
+ * since there is more data to transmit
+ */
+static void smc_tx_work(struct work_struct *work)
+{
+ struct smc_connection *conn = container_of(work,
+ struct smc_connection,
+ tx_work);
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+
+ lock_sock(&smc->sk);
+ smc_tx_sndbuf_nonempty(conn);
+ release_sock(&smc->sk);
+}
+
+void smc_tx_consumer_update(struct smc_connection *conn)
+{
+ union smc_host_cursor cfed, cons;
+ struct smc_cdc_tx_pend *pend;
+ struct smc_wr_buf *wr_buf;
+ int to_confirm, rc;
+
+ smc_curs_write(&cons,
+ smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+ conn);
+ smc_curs_write(&cfed,
+ smc_curs_read(&conn->rx_curs_confirmed, conn),
+ conn);
+ to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons);
+
+ if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+ ((to_confirm > conn->rmbe_update_limit) &&
+ ((to_confirm > (conn->rmbe_size / 2)) ||
+ conn->local_rx_ctrl.prod_flags.write_blocked))) {
+ rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
+ &wr_buf, &pend);
+ if (!rc)
+ rc = smc_cdc_msg_send(conn, wr_buf, pend);
+ if (rc < 0) {
+ schedule_work(&conn->tx_work);
+ return;
+ }
+ smc_curs_write(&conn->rx_curs_confirmed,
+ smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+ conn);
+ conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
+ }
+ if (conn->local_rx_ctrl.prod_flags.write_blocked &&
+ !atomic_read(&conn->bytes_to_rcv))
+ conn->local_rx_ctrl.prod_flags.write_blocked = 0;
+}
+
+/***************************** send initialize *******************************/
+
+/* Initialize send properties on connection establishment. NB: not __init! */
+void smc_tx_init(struct smc_sock *smc)
+{
+ smc->sk.sk_write_space = smc_tx_write_space;
+ INIT_WORK(&smc->conn.tx_work, smc_tx_work);
+ spin_lock_init(&smc->conn.send_lock);
+}
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
new file mode 100644
index 000000000000..1d6a0dcdcfe6
--- /dev/null
+++ b/net/smc/smc_tx.h
@@ -0,0 +1,35 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage send buffer
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_TX_H
+#define SMC_TX_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+#include "smc.h"
+#include "smc_cdc.h"
+
+static inline int smc_tx_prepared_sends(struct smc_connection *conn)
+{
+ union smc_host_cursor sent, prep;
+
+ smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
+ smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
+ return smc_curs_diff(conn->sndbuf_size, &sent, &prep);
+}
+
+void smc_tx_init(struct smc_sock *smc);
+int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
+void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
+void smc_tx_consumer_update(struct smc_connection *conn);
+
+#endif /* SMC_TX_H */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
new file mode 100644
index 000000000000..eadf157418dc
--- /dev/null
+++ b/net/smc/smc_wr.c
@@ -0,0 +1,614 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Work Requests exploiting Infiniband API
+ *
+ * Work requests (WR) of type ib_post_send or ib_post_recv respectively
+ * are submitted to either RC SQ or RC RQ respectively
+ * (reliably connected send/receive queue)
+ * and become work queue entries (WQEs).
+ * While an SQ WR/WQE is pending, we track it until transmission completion.
+ * Through a send or receive completion queue (CQ) respectively,
+ * we get completion queue entries (CQEs) [aka work completions (WCs)].
+ * Since the CQ callback is called from IRQ context, we split work by using
+ * bottom halves implemented by tasklets.
+ *
+ * SMC uses this to exchange LLC (link layer control)
+ * and CDC (connection data control) messages.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
+ */
+
+#include <linux/atomic.h>
+#include <linux/hashtable.h>
+#include <linux/wait.h>
+#include <rdma/ib_verbs.h>
+#include <asm/div64.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+
+#define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */
+
+#define SMC_WR_RX_HASH_BITS 4
+static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
+static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
+
+struct smc_wr_tx_pend { /* control data for a pending send request */
+ u64 wr_id; /* work request id sent */
+ smc_wr_tx_handler handler;
+ enum ib_wc_status wc_status; /* CQE status */
+ struct smc_link *link;
+ u32 idx;
+ struct smc_wr_tx_pend_priv priv;
+};
+
+/******************************** send queue *********************************/
+
+/*------------------------------- completion --------------------------------*/
+
+static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
+{
+ u32 i;
+
+ for (i = 0; i < link->wr_tx_cnt; i++) {
+ if (link->wr_tx_pends[i].wr_id == wr_id)
+ return i;
+ }
+ return link->wr_tx_cnt;
+}
+
+static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
+{
+ struct smc_wr_tx_pend pnd_snd;
+ struct smc_link *link;
+ u32 pnd_snd_idx;
+ int i;
+
+ link = wc->qp->qp_context;
+ pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
+ if (pnd_snd_idx == link->wr_tx_cnt)
+ return;
+ link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
+ memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd));
+ /* clear the full struct smc_wr_tx_pend including .priv */
+ memset(&link->wr_tx_pends[pnd_snd_idx], 0,
+ sizeof(link->wr_tx_pends[pnd_snd_idx]));
+ memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
+ sizeof(link->wr_tx_bufs[pnd_snd_idx]));
+ if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
+ return;
+ if (wc->status) {
+ struct smc_link_group *lgr;
+
+ for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
+ /* clear full struct smc_wr_tx_pend including .priv */
+ memset(&link->wr_tx_pends[i], 0,
+ sizeof(link->wr_tx_pends[i]));
+ memset(&link->wr_tx_bufs[i], 0,
+ sizeof(link->wr_tx_bufs[i]));
+ clear_bit(i, link->wr_tx_mask);
+ }
+ /* terminate connections of this link group abnormally */
+ lgr = container_of(link, struct smc_link_group,
+ lnk[SMC_SINGLE_LINK]);
+ smc_lgr_terminate(lgr);
+ }
+ if (pnd_snd.handler)
+ pnd_snd.handler(&pnd_snd.priv, link, wc->status);
+ wake_up(&link->wr_tx_wait);
+}
+
+static void smc_wr_tx_tasklet_fn(unsigned long data)
+{
+ struct smc_ib_device *dev = (struct smc_ib_device *)data;
+ struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
+ int i = 0, rc;
+ int polled = 0;
+
+again:
+ polled++;
+ do {
+ rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
+ if (polled == 1) {
+ ib_req_notify_cq(dev->roce_cq_send,
+ IB_CQ_NEXT_COMP |
+ IB_CQ_REPORT_MISSED_EVENTS);
+ }
+ if (!rc)
+ break;
+ for (i = 0; i < rc; i++)
+ smc_wr_tx_process_cqe(&wc[i]);
+ } while (rc > 0);
+ if (polled == 1)
+ goto again;
+}
+
+void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
+{
+ struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
+
+ tasklet_schedule(&dev->send_tasklet);
+}
+
+/*---------------------------- request submission ---------------------------*/
+
+static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
+{
+ *idx = link->wr_tx_cnt;
+ for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
+ if (!test_and_set_bit(*idx, link->wr_tx_mask))
+ return 0;
+ }
+ *idx = link->wr_tx_cnt;
+ return -EBUSY;
+}
+
+/**
+ * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
+ * and sets info for pending transmit tracking
+ * @link: Pointer to smc_link used to later send the message.
+ * @handler: Send completion handler function pointer.
+ * @wr_buf: Out value returns pointer to message buffer.
+ * @wr_pend_priv: Out value returns pointer serving as handler context.
+ *
+ * Return: 0 on success, or -errno on error.
+ */
+int smc_wr_tx_get_free_slot(struct smc_link *link,
+ smc_wr_tx_handler handler,
+ struct smc_wr_buf **wr_buf,
+ struct smc_wr_tx_pend_priv **wr_pend_priv)
+{
+ struct smc_wr_tx_pend *wr_pend;
+ struct ib_send_wr *wr_ib;
+ u64 wr_id;
+ u32 idx;
+ int rc;
+
+ *wr_buf = NULL;
+ *wr_pend_priv = NULL;
+ if (in_softirq()) {
+ rc = smc_wr_tx_get_free_slot_index(link, &idx);
+ if (rc)
+ return rc;
+ } else {
+ rc = wait_event_interruptible_timeout(
+ link->wr_tx_wait,
+ (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
+ SMC_WR_TX_WAIT_FREE_SLOT_TIME);
+ if (!rc) {
+ /* timeout - terminate connections */
+ struct smc_link_group *lgr;
+
+ lgr = container_of(link, struct smc_link_group,
+ lnk[SMC_SINGLE_LINK]);
+ smc_lgr_terminate(lgr);
+ return -EPIPE;
+ }
+ if (rc == -ERESTARTSYS)
+ return -EINTR;
+ if (idx == link->wr_tx_cnt)
+ return -EPIPE;
+ }
+ wr_id = smc_wr_tx_get_next_wr_id(link);
+ wr_pend = &link->wr_tx_pends[idx];
+ wr_pend->wr_id = wr_id;
+ wr_pend->handler = handler;
+ wr_pend->link = link;
+ wr_pend->idx = idx;
+ wr_ib = &link->wr_tx_ibs[idx];
+ wr_ib->wr_id = wr_id;
+ *wr_buf = &link->wr_tx_bufs[idx];
+ *wr_pend_priv = &wr_pend->priv;
+ return 0;
+}
+
+int smc_wr_tx_put_slot(struct smc_link *link,
+ struct smc_wr_tx_pend_priv *wr_pend_priv)
+{
+ struct smc_wr_tx_pend *pend;
+
+ pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
+ if (pend->idx < link->wr_tx_cnt) {
+ /* clear the full struct smc_wr_tx_pend including .priv */
+ memset(&link->wr_tx_pends[pend->idx], 0,
+ sizeof(link->wr_tx_pends[pend->idx]));
+ memset(&link->wr_tx_bufs[pend->idx], 0,
+ sizeof(link->wr_tx_bufs[pend->idx]));
+ test_and_clear_bit(pend->idx, link->wr_tx_mask);
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Send prepared WR slot via ib_post_send.
+ * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
+ */
+int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
+{
+ struct ib_send_wr *failed_wr = NULL;
+ struct smc_wr_tx_pend *pend;
+ int rc;
+
+ ib_req_notify_cq(link->smcibdev->roce_cq_send,
+ IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS);
+ pend = container_of(priv, struct smc_wr_tx_pend, priv);
+ rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx],
+ &failed_wr);
+ if (rc)
+ smc_wr_tx_put_slot(link, priv);
+ return rc;
+}
+
+void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type,
+ smc_wr_tx_filter filter,
+ smc_wr_tx_dismisser dismisser,
+ unsigned long data)
+{
+ struct smc_wr_tx_pend_priv *tx_pend;
+ struct smc_wr_rx_hdr *wr_rx;
+ int i;
+
+ for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
+ wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i];
+ if (wr_rx->type != wr_rx_hdr_type)
+ continue;
+ tx_pend = &link->wr_tx_pends[i].priv;
+ if (filter(tx_pend, data))
+ dismisser(tx_pend);
+ }
+}
+
+bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type,
+ smc_wr_tx_filter filter, unsigned long data)
+{
+ struct smc_wr_tx_pend_priv *tx_pend;
+ struct smc_wr_rx_hdr *wr_rx;
+ int i;
+
+ for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
+ wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i];
+ if (wr_rx->type != wr_rx_hdr_type)
+ continue;
+ tx_pend = &link->wr_tx_pends[i].priv;
+ if (filter(tx_pend, data))
+ return true;
+ }
+ return false;
+}
+
+/****************************** receive queue ********************************/
+
+int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
+{
+ struct smc_wr_rx_handler *h_iter;
+ int rc = 0;
+
+ spin_lock(&smc_wr_rx_hash_lock);
+ hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
+ if (h_iter->type == handler->type) {
+ rc = -EEXIST;
+ goto out_unlock;
+ }
+ }
+ hash_add(smc_wr_rx_hash, &handler->list, handler->type);
+out_unlock:
+ spin_unlock(&smc_wr_rx_hash_lock);
+ return rc;
+}
+
+/* Demultiplex a received work request based on the message type to its handler.
+ * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
+ * and not being modified any more afterwards so we don't need to lock it.
+ */
+static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
+{
+ struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+ struct smc_wr_rx_handler *handler;
+ struct smc_wr_rx_hdr *wr_rx;
+ u64 temp_wr_id;
+ u32 index;
+
+ if (wc->byte_len < sizeof(*wr_rx))
+ return; /* short message */
+ temp_wr_id = wc->wr_id;
+ index = do_div(temp_wr_id, link->wr_rx_cnt);
+ wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
+ hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
+ if (handler->type == wr_rx->type)
+ handler->handler(wc, wr_rx);
+ }
+}
+
+static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
+{
+ struct smc_link *link;
+ int i;
+
+ for (i = 0; i < num; i++) {
+ link = wc[i].qp->qp_context;
+ if (wc[i].status == IB_WC_SUCCESS) {
+ smc_wr_rx_demultiplex(&wc[i]);
+ smc_wr_rx_post(link); /* refill WR RX */
+ } else {
+ struct smc_link_group *lgr;
+
+ /* handle status errors */
+ switch (wc[i].status) {
+ case IB_WC_RETRY_EXC_ERR:
+ case IB_WC_RNR_RETRY_EXC_ERR:
+ case IB_WC_WR_FLUSH_ERR:
+ /* terminate connections of this link group
+ * abnormally
+ */
+ lgr = container_of(link, struct smc_link_group,
+ lnk[SMC_SINGLE_LINK]);
+ smc_lgr_terminate(lgr);
+ break;
+ default:
+ smc_wr_rx_post(link); /* refill WR RX */
+ break;
+ }
+ }
+ }
+}
+
+static void smc_wr_rx_tasklet_fn(unsigned long data)
+{
+ struct smc_ib_device *dev = (struct smc_ib_device *)data;
+ struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
+ int polled = 0;
+ int rc;
+
+again:
+ polled++;
+ do {
+ memset(&wc, 0, sizeof(wc));
+ rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
+ if (polled == 1) {
+ ib_req_notify_cq(dev->roce_cq_recv,
+ IB_CQ_SOLICITED_MASK
+ | IB_CQ_REPORT_MISSED_EVENTS);
+ }
+ if (!rc)
+ break;
+ smc_wr_rx_process_cqes(&wc[0], rc);
+ } while (rc > 0);
+ if (polled == 1)
+ goto again;
+}
+
+void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
+{
+ struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
+
+ tasklet_schedule(&dev->recv_tasklet);
+}
+
+int smc_wr_rx_post_init(struct smc_link *link)
+{
+ u32 i;
+ int rc = 0;
+
+ for (i = 0; i < link->wr_rx_cnt; i++)
+ rc = smc_wr_rx_post(link);
+ return rc;
+}
+
+/***************************** init, exit, misc ******************************/
+
+void smc_wr_remember_qp_attr(struct smc_link *lnk)
+{
+ struct ib_qp_attr *attr = &lnk->qp_attr;
+ struct ib_qp_init_attr init_attr;
+
+ memset(attr, 0, sizeof(*attr));
+ memset(&init_attr, 0, sizeof(init_attr));
+ ib_query_qp(lnk->roce_qp, attr,
+ IB_QP_STATE |
+ IB_QP_CUR_STATE |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY |
+ IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_RQ_PSN |
+ IB_QP_ALT_PATH |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_SQ_PSN |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_CAP |
+ IB_QP_DEST_QPN,
+ &init_attr);
+
+ lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
+ lnk->qp_attr.cap.max_send_wr);
+ lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
+ lnk->qp_attr.cap.max_recv_wr);
+}
+
+static void smc_wr_init_sge(struct smc_link *lnk)
+{
+ u32 i;
+
+ for (i = 0; i < lnk->wr_tx_cnt; i++) {
+ lnk->wr_tx_sges[i].addr =
+ lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
+ lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
+ lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
+ lnk->wr_tx_ibs[i].next = NULL;
+ lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
+ lnk->wr_tx_ibs[i].num_sge = 1;
+ lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
+ lnk->wr_tx_ibs[i].send_flags =
+ IB_SEND_SIGNALED | IB_SEND_SOLICITED | IB_SEND_INLINE;
+ }
+ for (i = 0; i < lnk->wr_rx_cnt; i++) {
+ lnk->wr_rx_sges[i].addr =
+ lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
+ lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE;
+ lnk->wr_rx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
+ lnk->wr_rx_ibs[i].next = NULL;
+ lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i];
+ lnk->wr_rx_ibs[i].num_sge = 1;
+ }
+}
+
+void smc_wr_free_link(struct smc_link *lnk)
+{
+ struct ib_device *ibdev;
+
+ memset(lnk->wr_tx_mask, 0,
+ BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
+
+ if (!lnk->smcibdev)
+ return;
+ ibdev = lnk->smcibdev->ibdev;
+
+ if (lnk->wr_rx_dma_addr) {
+ ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
+ SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+ DMA_FROM_DEVICE);
+ lnk->wr_rx_dma_addr = 0;
+ }
+ if (lnk->wr_tx_dma_addr) {
+ ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
+ SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
+ DMA_TO_DEVICE);
+ lnk->wr_tx_dma_addr = 0;
+ }
+}
+
+void smc_wr_free_link_mem(struct smc_link *lnk)
+{
+ kfree(lnk->wr_tx_pends);
+ lnk->wr_tx_pends = NULL;
+ kfree(lnk->wr_tx_mask);
+ lnk->wr_tx_mask = NULL;
+ kfree(lnk->wr_tx_sges);
+ lnk->wr_tx_sges = NULL;
+ kfree(lnk->wr_rx_sges);
+ lnk->wr_rx_sges = NULL;
+ kfree(lnk->wr_rx_ibs);
+ lnk->wr_rx_ibs = NULL;
+ kfree(lnk->wr_tx_ibs);
+ lnk->wr_tx_ibs = NULL;
+ kfree(lnk->wr_tx_bufs);
+ lnk->wr_tx_bufs = NULL;
+ kfree(lnk->wr_rx_bufs);
+ lnk->wr_rx_bufs = NULL;
+}
+
+int smc_wr_alloc_link_mem(struct smc_link *link)
+{
+ /* allocate link related memory */
+ link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
+ if (!link->wr_tx_bufs)
+ goto no_mem;
+ link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
+ GFP_KERNEL);
+ if (!link->wr_rx_bufs)
+ goto no_mem_wr_tx_bufs;
+ link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
+ GFP_KERNEL);
+ if (!link->wr_tx_ibs)
+ goto no_mem_wr_rx_bufs;
+ link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
+ sizeof(link->wr_rx_ibs[0]),
+ GFP_KERNEL);
+ if (!link->wr_rx_ibs)
+ goto no_mem_wr_tx_ibs;
+ link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
+ GFP_KERNEL);
+ if (!link->wr_tx_sges)
+ goto no_mem_wr_rx_ibs;
+ link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
+ sizeof(link->wr_rx_sges[0]),
+ GFP_KERNEL);
+ if (!link->wr_rx_sges)
+ goto no_mem_wr_tx_sges;
+ link->wr_tx_mask = kzalloc(
+ BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*link->wr_tx_mask),
+ GFP_KERNEL);
+ if (!link->wr_tx_mask)
+ goto no_mem_wr_rx_sges;
+ link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
+ sizeof(link->wr_tx_pends[0]),
+ GFP_KERNEL);
+ if (!link->wr_tx_pends)
+ goto no_mem_wr_tx_mask;
+ return 0;
+
+no_mem_wr_tx_mask:
+ kfree(link->wr_tx_mask);
+no_mem_wr_rx_sges:
+ kfree(link->wr_rx_sges);
+no_mem_wr_tx_sges:
+ kfree(link->wr_tx_sges);
+no_mem_wr_rx_ibs:
+ kfree(link->wr_rx_ibs);
+no_mem_wr_tx_ibs:
+ kfree(link->wr_tx_ibs);
+no_mem_wr_rx_bufs:
+ kfree(link->wr_rx_bufs);
+no_mem_wr_tx_bufs:
+ kfree(link->wr_tx_bufs);
+no_mem:
+ return -ENOMEM;
+}
+
+void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
+{
+ tasklet_kill(&smcibdev->recv_tasklet);
+ tasklet_kill(&smcibdev->send_tasklet);
+}
+
+void smc_wr_add_dev(struct smc_ib_device *smcibdev)
+{
+ tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn,
+ (unsigned long)smcibdev);
+ tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn,
+ (unsigned long)smcibdev);
+}
+
+int smc_wr_create_link(struct smc_link *lnk)
+{
+ struct ib_device *ibdev = lnk->smcibdev->ibdev;
+ int rc = 0;
+
+ smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
+ lnk->wr_rx_id = 0;
+ lnk->wr_rx_dma_addr = ib_dma_map_single(
+ ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
+ lnk->wr_rx_dma_addr = 0;
+ rc = -EIO;
+ goto out;
+ }
+ lnk->wr_tx_dma_addr = ib_dma_map_single(
+ ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
+ rc = -EIO;
+ goto dma_unmap;
+ }
+ smc_wr_init_sge(lnk);
+ memset(lnk->wr_tx_mask, 0,
+ BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
+ return rc;
+
+dma_unmap:
+ ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
+ SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+ DMA_FROM_DEVICE);
+ lnk->wr_rx_dma_addr = 0;
+out:
+ return rc;
+}
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
new file mode 100644
index 000000000000..0b9beeda6053
--- /dev/null
+++ b/net/smc/smc_wr.h
@@ -0,0 +1,106 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Work Requests exploiting Infiniband API
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_WR_H
+#define SMC_WR_H
+
+#include <linux/atomic.h>
+#include <rdma/ib_verbs.h>
+#include <asm/div64.h>
+
+#include "smc.h"
+#include "smc_core.h"
+
+#define SMC_WR_MAX_CQE 32768 /* max. # of completion queue elements */
+#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */
+
+#define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ)
+#define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ)
+
+#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
+
+#define SMC_WR_TX_PEND_PRIV_SIZE 32
+
+struct smc_wr_tx_pend_priv {
+ u8 priv[SMC_WR_TX_PEND_PRIV_SIZE];
+};
+
+typedef void (*smc_wr_tx_handler)(struct smc_wr_tx_pend_priv *,
+ struct smc_link *,
+ enum ib_wc_status);
+
+typedef bool (*smc_wr_tx_filter)(struct smc_wr_tx_pend_priv *,
+ unsigned long);
+
+typedef void (*smc_wr_tx_dismisser)(struct smc_wr_tx_pend_priv *);
+
+struct smc_wr_rx_handler {
+ struct hlist_node list; /* hash table collision resolution */
+ void (*handler)(struct ib_wc *, void *);
+ u8 type;
+};
+
+/* Only used by RDMA write WRs.
+ * All other WRs (CDC/LLC) use smc_wr_tx_send handling WR_ID implicitly
+ */
+static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link)
+{
+ return atomic_long_inc_return(&link->wr_tx_id);
+}
+
+static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
+{
+ atomic_long_set(wr_tx_id, val);
+}
+
+/* post a new receive work request to fill a completed old work request entry */
+static inline int smc_wr_rx_post(struct smc_link *link)
+{
+ struct ib_recv_wr *bad_recv_wr = NULL;
+ int rc;
+ u64 wr_id, temp_wr_id;
+ u32 index;
+
+ wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */
+ temp_wr_id = wr_id;
+ index = do_div(temp_wr_id, link->wr_rx_cnt);
+ link->wr_rx_ibs[index].wr_id = wr_id;
+ rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], &bad_recv_wr);
+ return rc;
+}
+
+int smc_wr_create_link(struct smc_link *lnk);
+int smc_wr_alloc_link_mem(struct smc_link *lnk);
+void smc_wr_free_link(struct smc_link *lnk);
+void smc_wr_free_link_mem(struct smc_link *lnk);
+void smc_wr_remember_qp_attr(struct smc_link *lnk);
+void smc_wr_remove_dev(struct smc_ib_device *smcibdev);
+void smc_wr_add_dev(struct smc_ib_device *smcibdev);
+
+int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler,
+ struct smc_wr_buf **wr_buf,
+ struct smc_wr_tx_pend_priv **wr_pend_priv);
+int smc_wr_tx_put_slot(struct smc_link *link,
+ struct smc_wr_tx_pend_priv *wr_pend_priv);
+int smc_wr_tx_send(struct smc_link *link,
+ struct smc_wr_tx_pend_priv *wr_pend_priv);
+void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
+bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type,
+ smc_wr_tx_filter filter, unsigned long data);
+void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
+ smc_wr_tx_filter filter,
+ smc_wr_tx_dismisser dismisser,
+ unsigned long data);
+
+int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
+int smc_wr_rx_post_init(struct smc_link *link);
+void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
+
+#endif /* SMC_WR_H */
diff --git a/net/socket.c b/net/socket.c
index 73dc69f9681e..985ef06792d6 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -90,7 +90,7 @@
#include <linux/slab.h>
#include <linux/xattr.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <net/compat.h>
@@ -287,7 +287,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static void init_inodecache(void)
{
sock_inode_cachep = kmem_cache_create("sock_inode_cache",
sizeof(struct socket_alloc),
@@ -296,9 +296,7 @@ static int init_inodecache(void)
SLAB_RECLAIM_ACCOUNT |
SLAB_MEM_SPREAD | SLAB_ACCOUNT),
init_once);
- if (sock_inode_cachep == NULL)
- return -ENOMEM;
- return 0;
+ BUG_ON(sock_inode_cachep == NULL);
}
static const struct super_operations sockfs_ops = {
@@ -533,8 +531,22 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
return used;
}
+static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ int err = simple_setattr(dentry, iattr);
+
+ if (!err && (iattr->ia_valid & ATTR_UID)) {
+ struct socket *sock = SOCKET_I(d_inode(dentry));
+
+ sock->sk->sk_uid = iattr->ia_uid;
+ }
+
+ return err;
+}
+
static const struct inode_operations sockfs_inode_ops = {
.listxattr = sockfs_listxattr,
+ .setattr = sockfs_setattr,
};
/**
@@ -640,6 +652,16 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
}
EXPORT_SYMBOL(kernel_sendmsg);
+static bool skb_is_err_queue(const struct sk_buff *skb)
+{
+ /* pkt_type of skbs enqueued on the error queue are set to
+ * PACKET_OUTGOING in skb_set_err_queue(). This is only safe to do
+ * in recvmsg, since skbs received on a local socket will never
+ * have a pkt_type of PACKET_OUTGOING.
+ */
+ return skb->pkt_type == PACKET_OUTGOING;
+}
+
/*
* called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
*/
@@ -654,7 +676,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
/* Race occurred between timestamp enabling and packet
receiving. Fill in the current time for now. */
- if (need_software_tstamp && skb->tstamp.tv64 == 0)
+ if (need_software_tstamp && skb->tstamp == 0)
__net_timestamp(skb);
if (need_software_tstamp) {
@@ -679,9 +701,15 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
(sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2))
empty = 0;
- if (!empty)
+ if (!empty) {
put_cmsg(msg, SOL_SOCKET,
SCM_TIMESTAMPING, sizeof(tss), &tss);
+
+ if (skb_is_err_queue(skb) && skb->len &&
+ SKB_EXT_ERR(skb)->opt_stats)
+ put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS,
+ skb->len, skb->data);
+ }
}
EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
@@ -892,6 +920,11 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
* what to do with it - that's up to the protocol still.
*/
+static struct ns_common *get_net_ns(struct ns_common *ns)
+{
+ return &get_net(container_of(ns, struct net, ns))->ns;
+}
+
static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
struct socket *sock;
@@ -960,6 +993,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
err = dlci_ioctl_hook(cmd, argp);
mutex_unlock(&dlci_ioctl_mutex);
break;
+ case SIOCGSKNS:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+
+ err = open_related_ns(&net->ns, get_net_ns);
+ break;
default:
err = sock_do_ioctl(net, sock, cmd, arg);
break;
@@ -1477,7 +1517,7 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
if (err)
goto out_fd;
- err = sock->ops->accept(sock, newsock, sock->file->f_flags);
+ err = sock->ops->accept(sock, newsock, sock->file->f_flags, false);
if (err < 0)
goto out_fd;
@@ -1702,6 +1742,7 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
/* We assume all kernel code knows the size of sockaddr_storage */
msg.msg_namelen = 0;
msg.msg_iocb = NULL;
+ msg.msg_flags = 0;
if (sock->file->f_flags & O_NONBLOCK)
flags |= MSG_DONTWAIT;
err = sock_recvmsg(sock, &msg, flags);
@@ -1887,7 +1928,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
struct sockaddr_storage address;
struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
unsigned char ctl[sizeof(struct cmsghdr) + 20]
- __attribute__ ((aligned(sizeof(__kernel_size_t))));
+ __aligned(sizeof(__kernel_size_t));
/* 20 is size of ipv6_pktinfo */
unsigned char *ctl_buf = ctl;
int ctl_len;
@@ -1917,6 +1958,8 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
ctl_buf = msg_sys->msg_control;
ctl_len = msg_sys->msg_controllen;
} else if (ctl_len) {
+ BUILD_BUG_ON(sizeof(struct cmsghdr) !=
+ CMSG_ALIGN(sizeof(struct cmsghdr)));
if (ctl_len > sizeof(ctl)) {
ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
if (ctl_buf == NULL)
@@ -2197,8 +2240,10 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
return err;
err = sock_error(sock->sk);
- if (err)
+ if (err) {
+ datagrams = err;
goto out_put;
+ }
entry = mmsg;
compat_entry = (struct compat_mmsghdr __user *)mmsg;
@@ -3110,6 +3155,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
case SIOCSIFVLAN:
case SIOCADDDLCI:
case SIOCDELDLCI:
+ case SIOCGSKNS:
return sock_ioctl(file, cmd, arg);
case SIOCGIFFLAGS:
@@ -3204,7 +3250,7 @@ int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
if (err < 0)
goto done;
- err = sock->ops->accept(sock, *newsock, flags);
+ err = sock->ops->accept(sock, *newsock, flags, true);
if (err < 0) {
sock_release(*newsock);
*newsock = NULL;
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 41adf362936d..b5c279b22680 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -504,6 +504,7 @@ static int __init strp_mod_init(void)
static void __exit strp_mod_exit(void)
{
+ destroy_workqueue(strp_wq);
}
module_init(strp_mod_init);
module_exit(strp_mod_exit);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 2bff63a73cf8..d2623b9f23d6 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -8,6 +8,7 @@
#include <linux/types.h>
#include <linux/sched.h>
+#include <linux/cred.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/errno.h>
@@ -464,8 +465,10 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
* Note that the cred_unused list must be time-ordered.
*/
if (time_in_range(cred->cr_expire, expired, jiffies) &&
- test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0)
+ test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) {
+ freed = SHRINK_STOP;
break;
+ }
list_del_init(&cred->cr_lru);
number_cred_unused--;
@@ -520,7 +523,7 @@ static unsigned long
rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
- return (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
+ return number_cred_unused * sysctl_vfs_cache_pressure / 100;
}
static void
@@ -646,9 +649,6 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
cred->cr_auth = auth;
cred->cr_ops = ops;
cred->cr_expire = jiffies;
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
- cred->cr_magic = RPCAUTH_CRED_MAGIC;
-#endif
cred->cr_uid = acred->uid;
}
EXPORT_SYMBOL_GPL(rpcauth_init_cred);
@@ -876,8 +876,12 @@ int __init rpcauth_init_module(void)
err = rpc_init_generic_auth();
if (err < 0)
goto out2;
- register_shrinker(&rpc_cred_shrinker);
+ err = register_shrinker(&rpc_cred_shrinker);
+ if (err < 0)
+ goto out3;
return 0;
+out3:
+ rpc_destroy_generic_auth();
out2:
rpc_destroy_authunix();
out1:
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 3dfd769dc5b5..4f16953e4954 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -50,7 +50,7 @@
#include <linux/workqueue.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
#include <linux/sunrpc/gss_api.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/hashtable.h>
#include "../netns.h"
@@ -541,9 +541,13 @@ gss_setup_upcall(struct gss_auth *gss_auth, struct rpc_cred *cred)
return gss_new;
gss_msg = gss_add_msg(gss_new);
if (gss_msg == gss_new) {
- int res = rpc_queue_upcall(gss_new->pipe, &gss_new->msg);
+ int res;
+ atomic_inc(&gss_msg->count);
+ res = rpc_queue_upcall(gss_new->pipe, &gss_new->msg);
if (res) {
gss_unhash_msg(gss_new);
+ atomic_dec(&gss_msg->count);
+ gss_release_msg(gss_new);
gss_msg = ERR_PTR(res);
}
} else
@@ -759,7 +763,7 @@ err_put_ctx:
err:
kfree(buf);
out:
- dprintk("RPC: %s returning %Zd\n", __func__, err);
+ dprintk("RPC: %s returning %zd\n", __func__, err);
return err;
}
@@ -836,6 +840,7 @@ gss_pipe_destroy_msg(struct rpc_pipe_msg *msg)
warn_gssd();
gss_release_msg(gss_msg);
}
+ gss_release_msg(gss_msg);
}
static void gss_pipe_dentry_destroy(struct dentry *dir,
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 90115ceefd49..fb39284ec174 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -200,7 +200,7 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
if (IS_ERR(hmac_md5))
goto out_free_md5;
- req = ahash_request_alloc(md5, GFP_KERNEL);
+ req = ahash_request_alloc(md5, GFP_NOFS);
if (!req)
goto out_free_hmac_md5;
@@ -230,7 +230,7 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
goto out;
ahash_request_free(req);
- req = ahash_request_alloc(hmac_md5, GFP_KERNEL);
+ req = ahash_request_alloc(hmac_md5, GFP_NOFS);
if (!req)
goto out_free_hmac_md5;
@@ -299,7 +299,7 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
if (IS_ERR(tfm))
goto out_free_cksum;
- req = ahash_request_alloc(tfm, GFP_KERNEL);
+ req = ahash_request_alloc(tfm, GFP_NOFS);
if (!req)
goto out_free_ahash;
@@ -397,7 +397,7 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
goto out_free_cksum;
checksumlen = crypto_ahash_digestsize(tfm);
- req = ahash_request_alloc(tfm, GFP_KERNEL);
+ req = ahash_request_alloc(tfm, GFP_NOFS);
if (!req)
goto out_free_ahash;
@@ -963,7 +963,7 @@ krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
}
desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
- GFP_KERNEL);
+ GFP_NOFS);
if (!desc) {
dprintk("%s: failed to allocate shash descriptor for '%s'\n",
__func__, kctx->gk5e->cksum_name);
@@ -1030,7 +1030,7 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
}
desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
- GFP_KERNEL);
+ GFP_NOFS);
if (!desc) {
dprintk("%s: failed to allocate shash descriptor for '%s'\n",
__func__, kctx->gk5e->cksum_name);
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 60595835317a..7bb2514aadd9 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -451,8 +451,7 @@ context_derive_keys_rc4(struct krb5_ctx *ctx)
goto out_err_free_hmac;
- desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
- GFP_KERNEL);
+ desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), GFP_NOFS);
if (!desc) {
dprintk("%s: failed to allocate hash descriptor for '%s'\n",
__func__, ctx->gk5e->cksum_name);
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index dc6fb79a361f..25d9a9cf7b66 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -260,7 +260,7 @@ static int gssx_dec_option_array(struct xdr_stream *xdr,
if (!oa->data)
return -ENOMEM;
- creds = kmalloc(sizeof(struct svc_cred), GFP_KERNEL);
+ creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL);
if (!creds) {
kfree(oa->data);
return -ENOMEM;
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 45662d7f0943..a54a7a3d28f5 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1489,8 +1489,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
case RPC_GSS_PROC_DESTROY:
if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
goto auth_err;
- rsci->h.expiry_time = get_seconds();
- set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+ /* Delete the entry from the cache_list and call cache_put */
+ sunrpc_cache_unhash(sn->rsc_cache, &rsci->h);
if (resv->iov_len + 4 > PAGE_SIZE)
goto drop;
svc_putnl(resv, RPC_SUCCESS);
@@ -1548,7 +1548,7 @@ complete:
ret = SVC_COMPLETE;
goto out;
drop:
- ret = SVC_DROP;
+ ret = SVC_CLOSE;
out:
if (rsci)
cache_put(&rsci->h, sn->rsc_cache);
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 4d17376b2acb..5f3d527dff65 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -139,7 +139,4 @@ struct rpc_cred null_cred = {
.cr_ops = &null_credops,
.cr_count = ATOMIC_INIT(1),
.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE,
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
- .cr_magic = RPCAUTH_CRED_MAGIC,
-#endif
};
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 306fc0f54596..82337e1ec9cd 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -14,12 +14,10 @@
#include <linux/sunrpc/auth.h>
#include <linux/user_namespace.h>
-#define NFS_NGROUPS 16
-
struct unx_cred {
struct rpc_cred uc_base;
kgid_t uc_gid;
- kgid_t uc_gids[NFS_NGROUPS];
+ kgid_t uc_gids[UNX_NGROUPS];
};
#define uc_uid uc_base.cr_uid
@@ -82,13 +80,13 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t
if (acred->group_info != NULL)
groups = acred->group_info->ngroups;
- if (groups > NFS_NGROUPS)
- groups = NFS_NGROUPS;
+ if (groups > UNX_NGROUPS)
+ groups = UNX_NGROUPS;
cred->uc_gid = acred->gid;
for (i = 0; i < groups; i++)
cred->uc_gids[i] = acred->group_info->gid[i];
- if (i < NFS_NGROUPS)
+ if (i < UNX_NGROUPS)
cred->uc_gids[i] = INVALID_GID;
return &cred->uc_base;
@@ -132,12 +130,12 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags)
if (acred->group_info != NULL)
groups = acred->group_info->ngroups;
- if (groups > NFS_NGROUPS)
- groups = NFS_NGROUPS;
+ if (groups > UNX_NGROUPS)
+ groups = UNX_NGROUPS;
for (i = 0; i < groups ; i++)
if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i]))
return 0;
- if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups]))
+ if (groups < UNX_NGROUPS && gid_valid(cred->uc_gids[groups]))
return 0;
return 1;
}
@@ -166,7 +164,7 @@ unx_marshal(struct rpc_task *task, __be32 *p)
*p++ = htonl((u32) from_kuid(&init_user_ns, cred->uc_uid));
*p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gid));
hold = p++;
- for (i = 0; i < 16 && gid_valid(cred->uc_gids[i]); i++)
+ for (i = 0; i < UNX_NGROUPS && gid_valid(cred->uc_gids[i]); i++)
*p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gids[i]));
*hold = htonl(p - hold - 1); /* gid array length */
*base = htonl((p - base - 1) << 2); /* cred length */
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 8aabe12201f8..79d55d949d9a 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -21,7 +21,7 @@
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/string_helpers.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/poll.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
@@ -362,11 +362,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
cache_purge(cd);
spin_lock(&cache_list_lock);
write_lock(&cd->hash_lock);
- if (cd->entries) {
- write_unlock(&cd->hash_lock);
- spin_unlock(&cache_list_lock);
- goto out;
- }
if (current_detail == cd)
current_detail = NULL;
list_del_init(&cd->others);
@@ -376,9 +371,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
/* module must be being unloaded so its safe to kill the worker */
cancel_delayed_work_sync(&cache_cleaner);
}
- return;
-out:
- printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name);
}
EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail);
@@ -497,13 +489,32 @@ EXPORT_SYMBOL_GPL(cache_flush);
void cache_purge(struct cache_detail *detail)
{
- time_t now = seconds_since_boot();
- if (detail->flush_time >= now)
- now = detail->flush_time + 1;
- /* 'now' is the maximum value any 'last_refresh' can have */
- detail->flush_time = now;
- detail->nextcheck = seconds_since_boot();
- cache_flush();
+ struct cache_head *ch = NULL;
+ struct hlist_head *head = NULL;
+ struct hlist_node *tmp = NULL;
+ int i = 0;
+
+ write_lock(&detail->hash_lock);
+ if (!detail->entries) {
+ write_unlock(&detail->hash_lock);
+ return;
+ }
+
+ dprintk("RPC: %d entries in %s cache\n", detail->entries, detail->name);
+ for (i = 0; i < detail->hash_size; i++) {
+ head = &detail->hash_table[i];
+ hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
+ hlist_del_init(&ch->cache_list);
+ detail->entries--;
+
+ set_bit(CACHE_CLEANED, &ch->flags);
+ write_unlock(&detail->hash_lock);
+ cache_fresh_unlocked(ch, detail);
+ cache_put(ch, detail);
+ write_lock(&detail->hash_lock);
+ }
+ }
+ write_unlock(&detail->hash_lock);
}
EXPORT_SYMBOL_GPL(cache_purge);
@@ -717,7 +728,7 @@ void cache_clean_deferred(void *owner)
/*
* communicate with user-space
*
- * We have a magic /proc file - /proc/sunrpc/<cachename>/channel.
+ * We have a magic /proc file - /proc/net/rpc/<cachename>/channel.
* On read, you get a full request, or block.
* On write, an update request is processed.
* Poll works if anything to read, and always allows write.
@@ -1272,7 +1283,7 @@ EXPORT_SYMBOL_GPL(qword_get);
/*
- * support /proc/sunrpc/cache/$CACHENAME/content
+ * support /proc/net/rpc/$CACHENAME/content
* as a seqfile.
* We call ->cache_show passing NULL for the item to
* get a header, then pass each real item in the cache
@@ -1358,7 +1369,7 @@ static int c_show(struct seq_file *m, void *p)
ifdebug(CACHE)
seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n",
convert_to_wallclock(cp->expiry_time),
- atomic_read(&cp->ref.refcount), cp->flags);
+ kref_read(&cp->ref), cp->flags);
cache_get(cp);
if (cache_check(cd, cp, NULL))
/* cache_check does a cache_put on failure */
@@ -1427,20 +1438,11 @@ static ssize_t read_flush(struct file *file, char __user *buf,
struct cache_detail *cd)
{
char tbuf[22];
- unsigned long p = *ppos;
size_t len;
- snprintf(tbuf, sizeof(tbuf), "%lu\n", convert_to_wallclock(cd->flush_time));
- len = strlen(tbuf);
- if (p >= len)
- return 0;
- len -= p;
- if (len > count)
- len = count;
- if (copy_to_user(buf, (void*)(tbuf+p), len))
- return -EFAULT;
- *ppos += len;
- return len;
+ len = snprintf(tbuf, sizeof(tbuf), "%lu\n",
+ convert_to_wallclock(cd->flush_time));
+ return simple_read_from_buffer(buf, count, ppos, tbuf, len);
}
static ssize_t write_flush(struct file *file, const char __user *buf,
@@ -1600,21 +1602,12 @@ static const struct file_operations cache_flush_operations_procfs = {
.llseek = no_llseek,
};
-static void remove_cache_proc_entries(struct cache_detail *cd, struct net *net)
+static void remove_cache_proc_entries(struct cache_detail *cd)
{
- struct sunrpc_net *sn;
-
- if (cd->u.procfs.proc_ent == NULL)
- return;
- if (cd->u.procfs.flush_ent)
- remove_proc_entry("flush", cd->u.procfs.proc_ent);
- if (cd->u.procfs.channel_ent)
- remove_proc_entry("channel", cd->u.procfs.proc_ent);
- if (cd->u.procfs.content_ent)
- remove_proc_entry("content", cd->u.procfs.proc_ent);
- cd->u.procfs.proc_ent = NULL;
- sn = net_generic(net, sunrpc_net_id);
- remove_proc_entry(cd->name, sn->proc_net_rpc);
+ if (cd->procfs) {
+ proc_remove(cd->procfs);
+ cd->procfs = NULL;
+ }
}
#ifdef CONFIG_PROC_FS
@@ -1624,38 +1617,30 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
struct sunrpc_net *sn;
sn = net_generic(net, sunrpc_net_id);
- cd->u.procfs.proc_ent = proc_mkdir(cd->name, sn->proc_net_rpc);
- if (cd->u.procfs.proc_ent == NULL)
+ cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc);
+ if (cd->procfs == NULL)
goto out_nomem;
- cd->u.procfs.channel_ent = NULL;
- cd->u.procfs.content_ent = NULL;
p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR,
- cd->u.procfs.proc_ent,
- &cache_flush_operations_procfs, cd);
- cd->u.procfs.flush_ent = p;
+ cd->procfs, &cache_flush_operations_procfs, cd);
if (p == NULL)
goto out_nomem;
if (cd->cache_request || cd->cache_parse) {
p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR,
- cd->u.procfs.proc_ent,
- &cache_file_operations_procfs, cd);
- cd->u.procfs.channel_ent = p;
+ cd->procfs, &cache_file_operations_procfs, cd);
if (p == NULL)
goto out_nomem;
}
if (cd->cache_show) {
p = proc_create_data("content", S_IFREG|S_IRUSR,
- cd->u.procfs.proc_ent,
- &content_file_operations_procfs, cd);
- cd->u.procfs.content_ent = p;
+ cd->procfs, &content_file_operations_procfs, cd);
if (p == NULL)
goto out_nomem;
}
return 0;
out_nomem:
- remove_cache_proc_entries(cd, net);
+ remove_cache_proc_entries(cd);
return -ENOMEM;
}
#else /* CONFIG_PROC_FS */
@@ -1684,7 +1669,7 @@ EXPORT_SYMBOL_GPL(cache_register_net);
void cache_unregister_net(struct cache_detail *cd, struct net *net)
{
- remove_cache_proc_entries(cd, net);
+ remove_cache_proc_entries(cd);
sunrpc_destroy_cache_detail(cd);
}
EXPORT_SYMBOL_GPL(cache_unregister_net);
@@ -1843,15 +1828,29 @@ int sunrpc_cache_register_pipefs(struct dentry *parent,
struct dentry *dir = rpc_create_cache_dir(parent, name, umode, cd);
if (IS_ERR(dir))
return PTR_ERR(dir);
- cd->u.pipefs.dir = dir;
+ cd->pipefs = dir;
return 0;
}
EXPORT_SYMBOL_GPL(sunrpc_cache_register_pipefs);
void sunrpc_cache_unregister_pipefs(struct cache_detail *cd)
{
- rpc_remove_cache_dir(cd->u.pipefs.dir);
- cd->u.pipefs.dir = NULL;
+ if (cd->pipefs) {
+ rpc_remove_cache_dir(cd->pipefs);
+ cd->pipefs = NULL;
+ }
}
EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs);
+void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h)
+{
+ write_lock(&cd->hash_lock);
+ if (!hlist_unhashed(&h->cache_list)){
+ hlist_del_init(&h->cache_list);
+ cd->entries--;
+ write_unlock(&cd->hash_lock);
+ cache_put(h, cd);
+ } else
+ write_unlock(&cd->hash_lock);
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_unhash);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 62a482790937..52da3ce54bb5 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -336,6 +336,11 @@ out:
static DEFINE_IDA(rpc_clids);
+void rpc_cleanup_clids(void)
+{
+ ida_destroy(&rpc_clids);
+}
+
static int rpc_alloc_clid(struct rpc_clnt *clnt)
{
int clid;
@@ -1448,21 +1453,6 @@ size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
/**
- * rpc_get_timeout - Get timeout for transport in units of HZ
- * @clnt: RPC client to query
- */
-unsigned long rpc_get_timeout(struct rpc_clnt *clnt)
-{
- unsigned long ret;
-
- rcu_read_lock();
- ret = rcu_dereference(clnt->cl_xprt)->timeout->to_initval;
- rcu_read_unlock();
- return ret;
-}
-EXPORT_SYMBOL_GPL(rpc_get_timeout);
-
-/**
* rpc_force_rebind - force transport to check that remote port is unchanged
* @clnt: client to rebind
*
@@ -1926,6 +1916,8 @@ call_connect_status(struct rpc_task *task)
case -EADDRINUSE:
case -ENOBUFS:
case -EPIPE:
+ xprt_conditional_disconnect(task->tk_rqstp->rq_xprt,
+ task->tk_rqstp->rq_connect_cookie);
if (RPC_IS_SOFTCONN(task))
break;
/* retry with existing socket, after a delay */
@@ -2692,6 +2684,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
{
struct rpc_xprt_switch *xps;
struct rpc_xprt *xprt;
+ unsigned long connect_timeout;
unsigned long reconnect_timeout;
unsigned char resvport;
int ret = 0;
@@ -2704,6 +2697,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
return -EAGAIN;
}
resvport = xprt->resvport;
+ connect_timeout = xprt->connect_timeout;
reconnect_timeout = xprt->max_reconnect_timeout;
rcu_read_unlock();
@@ -2713,7 +2707,10 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
goto out_put_switch;
}
xprt->resvport = resvport;
- xprt->max_reconnect_timeout = reconnect_timeout;
+ if (xprt->ops->set_connect_timeout != NULL)
+ xprt->ops->set_connect_timeout(xprt,
+ connect_timeout,
+ reconnect_timeout);
rpc_xprt_switch_set_roundrobin(xps);
if (setup) {
@@ -2730,26 +2727,39 @@ out_put_switch:
}
EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt);
+struct connect_timeout_data {
+ unsigned long connect_timeout;
+ unsigned long reconnect_timeout;
+};
+
static int
-rpc_xprt_cap_max_reconnect_timeout(struct rpc_clnt *clnt,
+rpc_xprt_set_connect_timeout(struct rpc_clnt *clnt,
struct rpc_xprt *xprt,
void *data)
{
- unsigned long timeout = *((unsigned long *)data);
+ struct connect_timeout_data *timeo = data;
- if (timeout < xprt->max_reconnect_timeout)
- xprt->max_reconnect_timeout = timeout;
+ if (xprt->ops->set_connect_timeout)
+ xprt->ops->set_connect_timeout(xprt,
+ timeo->connect_timeout,
+ timeo->reconnect_timeout);
return 0;
}
void
-rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo)
+rpc_set_connect_timeout(struct rpc_clnt *clnt,
+ unsigned long connect_timeout,
+ unsigned long reconnect_timeout)
{
+ struct connect_timeout_data timeout = {
+ .connect_timeout = connect_timeout,
+ .reconnect_timeout = reconnect_timeout,
+ };
rpc_clnt_iterate_for_each_xprt(clnt,
- rpc_xprt_cap_max_reconnect_timeout,
- &timeo);
+ rpc_xprt_set_connect_timeout,
+ &timeout);
}
-EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout);
+EXPORT_SYMBOL_GPL(rpc_set_connect_timeout);
void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt)
{
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index e7b4d93566df..c8fd0b6c1618 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -16,11 +16,6 @@ static struct dentry *rpc_xprt_dir;
unsigned int rpc_inject_disconnect;
-struct rpc_clnt_iter {
- struct rpc_clnt *clnt;
- loff_t pos;
-};
-
static int
tasks_show(struct seq_file *f, void *v)
{
@@ -47,12 +42,10 @@ static void *
tasks_start(struct seq_file *f, loff_t *ppos)
__acquires(&clnt->cl_lock)
{
- struct rpc_clnt_iter *iter = f->private;
+ struct rpc_clnt *clnt = f->private;
loff_t pos = *ppos;
- struct rpc_clnt *clnt = iter->clnt;
struct rpc_task *task;
- iter->pos = pos + 1;
spin_lock(&clnt->cl_lock);
list_for_each_entry(task, &clnt->cl_tasks, tk_task)
if (pos-- == 0)
@@ -63,12 +56,10 @@ tasks_start(struct seq_file *f, loff_t *ppos)
static void *
tasks_next(struct seq_file *f, void *v, loff_t *pos)
{
- struct rpc_clnt_iter *iter = f->private;
- struct rpc_clnt *clnt = iter->clnt;
+ struct rpc_clnt *clnt = f->private;
struct rpc_task *task = v;
struct list_head *next = task->tk_task.next;
- ++iter->pos;
++*pos;
/* If there's another task on list, return it */
@@ -81,9 +72,7 @@ static void
tasks_stop(struct seq_file *f, void *v)
__releases(&clnt->cl_lock)
{
- struct rpc_clnt_iter *iter = f->private;
- struct rpc_clnt *clnt = iter->clnt;
-
+ struct rpc_clnt *clnt = f->private;
spin_unlock(&clnt->cl_lock);
}
@@ -96,17 +85,13 @@ static const struct seq_operations tasks_seq_operations = {
static int tasks_open(struct inode *inode, struct file *filp)
{
- int ret = seq_open_private(filp, &tasks_seq_operations,
- sizeof(struct rpc_clnt_iter));
-
+ int ret = seq_open(filp, &tasks_seq_operations);
if (!ret) {
struct seq_file *seq = filp->private_data;
- struct rpc_clnt_iter *iter = seq->private;
-
- iter->clnt = inode->i_private;
+ struct rpc_clnt *clnt = seq->private = inode->i_private;
- if (!atomic_inc_not_zero(&iter->clnt->cl_count)) {
- seq_release_private(inode, filp);
+ if (!atomic_inc_not_zero(&clnt->cl_count)) {
+ seq_release(inode, filp);
ret = -EINVAL;
}
}
@@ -118,10 +103,10 @@ static int
tasks_release(struct inode *inode, struct file *filp)
{
struct seq_file *seq = filp->private_data;
- struct rpc_clnt_iter *iter = seq->private;
+ struct rpc_clnt *clnt = seq->private;
- rpc_release_client(iter->clnt);
- return seq_release_private(inode, filp);
+ rpc_release_client(clnt);
+ return seq_release(inode, filp);
}
static const struct file_operations tasks_fops = {
diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h
index df5826876535..394ce523174c 100644
--- a/net/sunrpc/netns.h
+++ b/net/sunrpc/netns.h
@@ -34,7 +34,7 @@ struct sunrpc_net {
struct proc_dir_entry *use_gssp_proc;
};
-extern int sunrpc_net_id;
+extern unsigned int sunrpc_net_id;
int ip_map_cache_create(struct net *);
void ip_map_cache_destroy(struct net *);
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 2ecb994314c1..caeb01ad2b5a 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -157,15 +157,17 @@ void rpc_count_iostats_metrics(const struct rpc_task *task,
spin_lock(&op_metrics->om_lock);
op_metrics->om_ops++;
- op_metrics->om_ntrans += req->rq_ntrans;
+ /* kernel API: om_ops must never become larger than om_ntrans */
+ op_metrics->om_ntrans += max(req->rq_ntrans, 1);
op_metrics->om_timeouts += task->tk_timeouts;
op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent;
op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd;
- delta = ktime_sub(req->rq_xtime, task->tk_start);
- op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta);
-
+ if (ktime_to_ns(req->rq_xtime)) {
+ delta = ktime_sub(req->rq_xtime, task->tk_start);
+ op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta);
+ }
op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt);
delta = ktime_sub(now, task->tk_start);
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index ee5d3d253102..c73de181467a 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -24,7 +24,7 @@
#include "netns.h"
-int sunrpc_net_id;
+unsigned int sunrpc_net_id;
EXPORT_SYMBOL_GPL(sunrpc_net_id);
static __net_init int sunrpc_init_net(struct net *net)
@@ -119,6 +119,7 @@ out:
static void __exit
cleanup_sunrpc(void)
{
+ rpc_cleanup_clids();
rpcauth_remove_module();
cleanup_socket_xprt();
svc_cleanup_xprt_sock();
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 7c8070ec93c8..a08aeb56b8e4 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -11,7 +11,7 @@
*/
#include <linux/linkage.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/net.h>
#include <linux/in.h>
@@ -385,7 +385,7 @@ static int svc_uses_rpcbind(struct svc_serv *serv)
for (i = 0; i < progp->pg_nvers; i++) {
if (progp->pg_vers[i] == NULL)
continue;
- if (progp->pg_vers[i]->vs_hidden == 0)
+ if (!progp->pg_vers[i]->vs_hidden)
return 1;
}
}
@@ -976,6 +976,13 @@ int svc_register(const struct svc_serv *serv, struct net *net,
if (vers->vs_hidden)
continue;
+ /*
+ * Don't register a UDP port if we need congestion
+ * control.
+ */
+ if (vers->vs_need_cong_ctrl && proto == IPPROTO_UDP)
+ continue;
+
error = __svc_register(net, progp->pg_name, progp->pg_prog,
i, family, proto, port);
@@ -1155,8 +1162,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
case SVC_DENIED:
goto err_bad_auth;
case SVC_CLOSE:
- if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
- svc_close_xprt(rqstp->rq_xprt);
+ goto close;
case SVC_DROP:
goto dropit;
case SVC_COMPLETE:
@@ -1170,6 +1176,21 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
!(versp = progp->pg_vers[vers]))
goto err_bad_vers;
+ /*
+ * Some protocol versions (namely NFSv4) require some form of
+ * congestion control. (See RFC 7530 section 3.1 paragraph 2)
+ * In other words, UDP is not allowed. We mark those when setting
+ * up the svc_xprt, and verify that here.
+ *
+ * The spec is not very clear about what error should be returned
+ * when someone tries to access a server that is listening on UDP
+ * for lower versions. RPC_PROG_MISMATCH seems to be the closest
+ * fit.
+ */
+ if (versp->vs_need_cong_ctrl &&
+ !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags))
+ goto err_bad_vers;
+
procp = versp->vs_proc + proc;
if (proc >= versp->vs_nproc || !procp->pc_func)
goto err_bad_proc;
@@ -1246,7 +1267,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
sendit:
if (svc_authorise(rqstp))
- goto dropit;
+ goto close;
return 1; /* Caller can now send it */
dropit:
@@ -1254,11 +1275,16 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
dprintk("svc: svc_process dropit\n");
return 0;
+ close:
+ if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
+ svc_close_xprt(rqstp->rq_xprt);
+ dprintk("svc: svc_process close\n");
+ return 0;
+
err_short_len:
- svc_printk(rqstp, "short len %Zd, dropping request\n",
+ svc_printk(rqstp, "short len %zd, dropping request\n",
argv->iov_len);
-
- goto dropit; /* drop request */
+ goto close;
err_bad_rpc:
serv->sv_stats->rpcbadfmt++;
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 3bc1d61694cb..7bfe1fb42add 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -490,7 +490,7 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
svc_xprt_get(xprt);
dprintk("svc: transport %p dequeued, inuse=%d\n",
- xprt, atomic_read(&xprt->xpt_ref.refcount));
+ xprt, kref_read(&xprt->xpt_ref));
}
spin_unlock_bh(&pool->sp_lock);
out:
@@ -799,6 +799,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) {
dprintk("svc_recv: found XPT_CLOSE\n");
+ if (test_and_clear_bit(XPT_KILL_TEMP, &xprt->xpt_flags))
+ xprt->xpt_ops->xpo_kill_temp_xprt(xprt);
svc_delete_xprt(xprt);
/* Leave XPT_BUSY set on the dead xprt: */
goto out;
@@ -820,7 +822,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
/* XPT_DATA|XPT_DEFERRED case: */
dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
rqstp, rqstp->rq_pool->sp_id, xprt,
- atomic_read(&xprt->xpt_ref.refcount));
+ kref_read(&xprt->xpt_ref));
rqstp->rq_deferred = svc_deferred_dequeue(xprt);
if (rqstp->rq_deferred)
len = svc_deferred_recv(rqstp);
@@ -978,7 +980,7 @@ static void svc_age_temp_xprts(unsigned long closure)
* through, close it. */
if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags))
continue;
- if (atomic_read(&xprt->xpt_ref.refcount) > 1 ||
+ if (kref_read(&xprt->xpt_ref) > 1 ||
test_bit(XPT_BUSY, &xprt->xpt_flags))
continue;
list_del_init(le);
@@ -1020,9 +1022,11 @@ void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr)
le = to_be_closed.next;
list_del_init(le);
xprt = list_entry(le, struct svc_xprt, xpt_list);
- dprintk("svc_age_temp_xprts_now: closing %p\n", xprt);
- xprt->xpt_ops->xpo_kill_temp_xprt(xprt);
- svc_close_xprt(xprt);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ set_bit(XPT_KILL_TEMP, &xprt->xpt_flags);
+ dprintk("svc_age_temp_xprts_now: queuing xprt %p for closing\n",
+ xprt);
+ svc_xprt_enqueue(xprt);
}
}
EXPORT_SYMBOL_GPL(svc_age_temp_xprts_now);
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index 69841db1f533..bb8db3cb8032 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -124,16 +124,20 @@ EXPORT_SYMBOL_GPL(svc_auth_unregister);
#define DN_HASHMAX (1<<DN_HASHBITS)
static struct hlist_head auth_domain_table[DN_HASHMAX];
-static spinlock_t auth_domain_lock =
- __SPIN_LOCK_UNLOCKED(auth_domain_lock);
+static DEFINE_SPINLOCK(auth_domain_lock);
+
+static void auth_domain_release(struct kref *kref)
+{
+ struct auth_domain *dom = container_of(kref, struct auth_domain, ref);
+
+ hlist_del(&dom->hash);
+ dom->flavour->domain_release(dom);
+ spin_unlock(&auth_domain_lock);
+}
void auth_domain_put(struct auth_domain *dom)
{
- if (atomic_dec_and_lock(&dom->ref.refcount, &auth_domain_lock)) {
- hlist_del(&dom->hash);
- dom->flavour->domain_release(dom);
- spin_unlock(&auth_domain_lock);
- }
+ kref_put_lock(&dom->ref, auth_domain_release, &auth_domain_lock);
}
EXPORT_SYMBOL_GPL(auth_domain_put);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 64af4f034de6..f81eaa8e0888 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -403,7 +403,7 @@ svcauth_unix_info_release(struct svc_xprt *xpt)
/****************************************************************************
* auth.unix.gid cache
* simple cache to map a UID to a list of GIDs
- * because AUTH_UNIX aka AUTH_SYS has a max of 16
+ * because AUTH_UNIX aka AUTH_SYS has a max of UNX_NGROUPS
*/
#define GID_HASHBITS 8
#define GID_HASHMAX (1<<GID_HASHBITS)
@@ -810,7 +810,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
cred->cr_uid = make_kuid(&init_user_ns, svc_getnl(argv)); /* uid */
cred->cr_gid = make_kgid(&init_user_ns, svc_getnl(argv)); /* gid */
slen = svc_getnl(argv); /* gids length */
- if (slen > 16 || (len -= (slen + 2)*4) < 0)
+ if (slen > UNX_NGROUPS || (len -= (slen + 2)*4) < 0)
goto badcred;
cred->cr_group_info = groups_alloc(slen);
if (cred->cr_group_info == NULL)
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index a4bc98265d88..2b720fa35c4f 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -39,9 +39,10 @@
#include <net/checksum.h>
#include <net/ip.h>
#include <net/ipv6.h>
+#include <net/udp.h>
#include <net/tcp.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <trace/events/skb.h>
@@ -129,6 +130,18 @@ static void svc_release_skb(struct svc_rqst *rqstp)
}
}
+static void svc_release_udp_skb(struct svc_rqst *rqstp)
+{
+ struct sk_buff *skb = rqstp->rq_xprt_ctxt;
+
+ if (skb) {
+ rqstp->rq_xprt_ctxt = NULL;
+
+ dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
+ consume_skb(skb);
+ }
+}
+
union svc_pktinfo_u {
struct in_pktinfo pkti;
struct in6_pktinfo pkti6;
@@ -265,7 +278,7 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
rqstp->rq_respages[0], tailoff);
out:
- dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n",
+ dprintk("svc: socket %p sendto([%p %zu... ], %d) = %d (addr %s)\n",
svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
@@ -333,7 +346,7 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
if (len == buflen)
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
+ dprintk("svc: socket %p recvfrom(%p, %zu) = %d\n",
svsk, iov[0].iov_base, iov[0].iov_len, len);
return len;
}
@@ -549,7 +562,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
0, 0, MSG_PEEK | MSG_DONTWAIT);
if (err >= 0)
- skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err);
+ skb = skb_recv_udp(svsk->sk_sk, 0, 1, &err);
if (skb == NULL) {
if (err != -EAGAIN) {
@@ -561,7 +574,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
}
len = svc_addr_len(svc_addr(rqstp));
rqstp->rq_addrlen = len;
- if (skb->tstamp.tv64 == 0) {
+ if (skb->tstamp == 0) {
skb->tstamp = ktime_get_real();
/* Don't enable netstamp, sunrpc doesn't
need that much accuracy */
@@ -590,7 +603,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
goto out_free;
}
local_bh_enable();
- skb_free_datagram_locked(svsk->sk_sk, skb);
+ consume_skb(skb);
} else {
/* we can use it in-place */
rqstp->rq_arg.head[0].iov_base = skb->data;
@@ -617,8 +630,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
return len;
out_free:
- trace_kfree_skb(skb, svc_udp_recvfrom);
- skb_free_datagram_locked(svsk->sk_sk, skb);
+ kfree_skb(skb);
return 0;
}
@@ -679,7 +691,7 @@ static struct svc_xprt_ops svc_udp_ops = {
.xpo_create = svc_udp_create,
.xpo_recvfrom = svc_udp_recvfrom,
.xpo_sendto = svc_udp_sendto,
- .xpo_release_rqst = svc_release_skb,
+ .xpo_release_rqst = svc_release_udp_skb,
.xpo_detach = svc_sock_detach,
.xpo_free = svc_sock_free,
.xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
@@ -1294,6 +1306,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class,
&svsk->sk_xprt, serv);
set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
+ set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
if (sk->sk_state == TCP_LISTEN) {
dprintk("setting up TCP socket for listening\n");
set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
@@ -1622,6 +1635,7 @@ static struct svc_xprt *svc_bc_create_socket(struct svc_serv *serv,
xprt = &svsk->sk_xprt;
svc_xprt_init(net, &svc_tcp_bc_class, xprt, serv);
+ set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
serv->sv_bc_xprt = xprt;
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index c88d9bc06f5c..8c3936403fea 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -14,7 +14,7 @@
#include <linux/sysctl.h>
#include <linux/module.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/sunrpc/types.h>
#include <linux/sunrpc/sched.h>
#include <linux/sunrpc/stats.h>
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 7f1071e103ca..1f7082144e01 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1518,3 +1518,37 @@ out:
}
EXPORT_SYMBOL_GPL(xdr_process_buf);
+/**
+ * xdr_stream_decode_string_dup - Decode and duplicate variable length string
+ * @xdr: pointer to xdr_stream
+ * @str: location to store pointer to string
+ * @maxlen: maximum acceptable string length
+ * @gfp_flags: GFP mask to use
+ *
+ * Return values:
+ * On success, returns length of NUL-terminated string stored in *@ptr
+ * %-EBADMSG on XDR buffer overflow
+ * %-EMSGSIZE if the size of the string would exceed @maxlen
+ * %-ENOMEM on memory allocation failure
+ */
+ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str,
+ size_t maxlen, gfp_t gfp_flags)
+{
+ void *p;
+ ssize_t ret;
+
+ ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen);
+ if (ret > 0) {
+ char *s = kmalloc(ret + 1, gfp_flags);
+ if (s != NULL) {
+ memcpy(s, p, ret);
+ s[ret] = '\0';
+ *str = s;
+ return strlen(s);
+ }
+ ret = -ENOMEM;
+ }
+ *str = NULL;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_decode_string_dup);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 685e6d225414..b530a2852ba8 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -669,7 +669,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
spin_lock_bh(&xprt->transport_lock);
if (cookie != xprt->connect_cookie)
goto out;
- if (test_bit(XPRT_CLOSING, &xprt->state) || !xprt_connected(xprt))
+ if (test_bit(XPRT_CLOSING, &xprt->state))
goto out;
set_bit(XPRT_CLOSE_WAIT, &xprt->state);
/* Try to schedule an autoclose RPC call */
@@ -772,6 +772,7 @@ void xprt_connect(struct rpc_task *task)
if (!xprt_connected(xprt)) {
task->tk_rqstp->rq_bytes_sent = 0;
task->tk_timeout = task->tk_rqstp->rq_timeout;
+ task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie;
rpc_sleep_on(&xprt->pending, task, xprt_connect_status);
if (test_bit(XPRT_CLOSING, &xprt->state))
@@ -896,13 +897,11 @@ static void xprt_timer(struct rpc_task *task)
return;
dprintk("RPC: %5u xprt_timer\n", task->tk_pid);
- spin_lock_bh(&xprt->transport_lock);
if (!req->rq_reply_bytes_recvd) {
if (xprt->ops->timer)
xprt->ops->timer(xprt, task);
} else
task->tk_status = 0;
- spin_unlock_bh(&xprt->transport_lock);
}
/**
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 2c472e1b4827..24fedd4b117e 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -55,7 +55,8 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
if (IS_ERR(rb))
goto out_fail;
req->rl_sendbuf = rb;
- xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, size);
+ xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base,
+ min_t(size_t, size, PAGE_SIZE));
rpcrdma_set_xprtdata(rqst, req);
return 0;
@@ -191,6 +192,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
size_t maxmsg;
maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize);
+ maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE);
return maxmsg - RPCRDMA_HDRLEN_MIN;
}
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 1ebb09e1ac4f..59e64025ed96 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -310,10 +310,7 @@ fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
struct rpcrdma_mw *mw;
while (!list_empty(&req->rl_registered)) {
- mw = list_first_entry(&req->rl_registered,
- struct rpcrdma_mw, mw_list);
- list_del_init(&mw->mw_list);
-
+ mw = rpcrdma_pop_mw(&req->rl_registered);
if (sync)
fmr_op_recover_mr(mw);
else
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 26b26beef2d4..f81dd93176c0 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -101,7 +101,7 @@ frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
struct rpcrdma_frmr *f = &r->frmr;
int rc;
- f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth);
+ f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
if (IS_ERR(f->fr_mr))
goto out_mr_err;
@@ -157,7 +157,7 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
return rc;
}
- f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG,
+ f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype,
ia->ri_max_frmr_depth);
if (IS_ERR(f->fr_mr)) {
pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
@@ -171,10 +171,6 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
}
/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR.
- *
- * There's no recovery if this fails. The FRMR is abandoned, but
- * remains in rb_all. It will be cleaned up when the transport is
- * destroyed.
*/
static void
frwr_op_recover_mr(struct rpcrdma_mw *mw)
@@ -210,11 +206,16 @@ static int
frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
+ struct ib_device_attr *attrs = &ia->ri_device->attrs;
int depth, delta;
+ ia->ri_mrtype = IB_MR_TYPE_MEM_REG;
+ if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
+ ia->ri_mrtype = IB_MR_TYPE_SG_GAPS;
+
ia->ri_max_frmr_depth =
min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- ia->ri_device->attrs.max_fast_reg_page_list_len);
+ attrs->max_fast_reg_page_list_len);
dprintk("RPC: %s: device's max FR page list len = %u\n",
__func__, ia->ri_max_frmr_depth);
@@ -241,8 +242,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
}
ep->rep_attr.cap.max_send_wr *= depth;
- if (ep->rep_attr.cap.max_send_wr > ia->ri_device->attrs.max_qp_wr) {
- cdata->max_requests = ia->ri_device->attrs.max_qp_wr / depth;
+ if (ep->rep_attr.cap.max_send_wr > attrs->max_qp_wr) {
+ cdata->max_requests = attrs->max_qp_wr / depth;
if (!cdata->max_requests)
return -EINVAL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests *
@@ -348,6 +349,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int nsegs, bool writing, struct rpcrdma_mw **out)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS;
struct rpcrdma_mw *mw;
struct rpcrdma_frmr *frmr;
struct ib_mr *mr;
@@ -383,8 +385,8 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
++seg;
++i;
-
- /* Check for holes */
+ if (holes_ok)
+ continue;
if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
@@ -421,7 +423,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
IB_ACCESS_REMOTE_READ;
- DECR_CQCOUNT(&r_xprt->rx_ep);
+ rpcrdma_set_signaled(&r_xprt->rx_ep, &reg_wr->wr);
rc = ib_post_send(ia->ri_id->qp, &reg_wr->wr, &bad_wr);
if (rc)
goto out_senderr;
@@ -451,26 +453,6 @@ out_senderr:
return -ENOTCONN;
}
-static struct ib_send_wr *
-__frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
-{
- struct rpcrdma_frmr *f = &mw->frmr;
- struct ib_send_wr *invalidate_wr;
-
- dprintk("RPC: %s: invalidating frmr %p\n", __func__, f);
-
- f->fr_state = FRMR_IS_INVALID;
- invalidate_wr = &f->fr_invwr;
-
- memset(invalidate_wr, 0, sizeof(*invalidate_wr));
- f->fr_cqe.done = frwr_wc_localinv;
- invalidate_wr->wr_cqe = &f->fr_cqe;
- invalidate_wr->opcode = IB_WR_LOCAL_INV;
- invalidate_wr->ex.invalidate_rkey = f->fr_mr->rkey;
-
- return invalidate_wr;
-}
-
/* Invalidate all memory regions that were registered for "req".
*
* Sleeps until it is safe for the host CPU to access the
@@ -481,12 +463,12 @@ __frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
static void
frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
- struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
+ struct ib_send_wr *first, **prev, *last, *bad_wr;
struct rpcrdma_rep *rep = req->rl_reply;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_mw *mw, *tmp;
struct rpcrdma_frmr *f;
- int rc;
+ struct rpcrdma_mw *mw;
+ int count, rc;
dprintk("RPC: %s: req %p\n", __func__, req);
@@ -496,22 +478,29 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* a single ib_post_send() call.
*/
f = NULL;
- invalidate_wrs = pos = prev = NULL;
+ count = 0;
+ prev = &first;
list_for_each_entry(mw, &req->rl_registered, mw_list) {
+ mw->frmr.fr_state = FRMR_IS_INVALID;
+
if ((rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) &&
- (mw->mw_handle == rep->rr_inv_rkey)) {
- mw->frmr.fr_state = FRMR_IS_INVALID;
+ (mw->mw_handle == rep->rr_inv_rkey))
continue;
- }
-
- pos = __frwr_prepare_linv_wr(mw);
- if (!invalidate_wrs)
- invalidate_wrs = pos;
- else
- prev->next = pos;
- prev = pos;
f = &mw->frmr;
+ dprintk("RPC: %s: invalidating frmr %p\n",
+ __func__, f);
+
+ f->fr_cqe.done = frwr_wc_localinv;
+ last = &f->fr_invwr;
+ memset(last, 0, sizeof(*last));
+ last->wr_cqe = &f->fr_cqe;
+ last->opcode = IB_WR_LOCAL_INV;
+ last->ex.invalidate_rkey = mw->mw_handle;
+ count++;
+
+ *prev = last;
+ prev = &last->next;
}
if (!f)
goto unmap;
@@ -520,17 +509,22 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* last WR in the chain completes, all WRs in the chain
* are complete.
*/
- f->fr_invwr.send_flags = IB_SEND_SIGNALED;
+ last->send_flags = IB_SEND_SIGNALED;
f->fr_cqe.done = frwr_wc_localinv_wake;
reinit_completion(&f->fr_linv_done);
- INIT_CQCOUNT(&r_xprt->rx_ep);
+
+ /* Initialize CQ count, since there is always a signaled
+ * WR being posted here. The new cqcount depends on how
+ * many SQEs are about to be consumed.
+ */
+ rpcrdma_init_cqcount(&r_xprt->rx_ep, count);
/* Transport disconnect drains the receive CQ before it
* replaces the QP. The RPC reply handler won't call us
* unless ri_id->qp is a valid pointer.
*/
r_xprt->rx_stats.local_inv_needed++;
- rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
+ rc = ib_post_send(ia->ri_id->qp, first, &bad_wr);
if (rc)
goto reset_mrs;
@@ -540,10 +534,10 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* them to the free MW list.
*/
unmap:
- list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
- dprintk("RPC: %s: unmapping frmr %p\n",
+ while (!list_empty(&req->rl_registered)) {
+ mw = rpcrdma_pop_mw(&req->rl_registered);
+ dprintk("RPC: %s: DMA unmapping frmr %p\n",
__func__, &mw->frmr);
- list_del_init(&mw->mw_list);
ib_dma_unmap_sg(ia->ri_device,
mw->mw_sg, mw->mw_nents, mw->mw_dir);
rpcrdma_put_mw(r_xprt, mw);
@@ -559,7 +553,7 @@ reset_mrs:
*/
list_for_each_entry(mw, &req->rl_registered, mw_list) {
f = &mw->frmr;
- if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
+ if (mw->mw_handle == bad_wr->ex.invalidate_rkey) {
__frwr_reset_mr(ia, mw);
bad_wr = bad_wr->next;
}
@@ -577,10 +571,7 @@ frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
struct rpcrdma_mw *mw;
while (!list_empty(&req->rl_registered)) {
- mw = list_first_entry(&req->rl_registered,
- struct rpcrdma_mw, mw_list);
- list_del_init(&mw->mw_list);
-
+ mw = rpcrdma_pop_mw(&req->rl_registered);
if (sync)
frwr_op_recover_mr(mw);
else
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index d987c2d3dd6e..a044be2d6ad7 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -125,14 +125,34 @@ void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
/* The client can send a request inline as long as the RPCRDMA header
* plus the RPC call fit under the transport's inline limit. If the
* combined call message size exceeds that limit, the client must use
- * the read chunk list for this operation.
+ * a Read chunk for this operation.
+ *
+ * A Read chunk is also required if sending the RPC call inline would
+ * exceed this device's max_sge limit.
*/
static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
struct rpc_rqst *rqst)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct xdr_buf *xdr = &rqst->rq_snd_buf;
+ unsigned int count, remaining, offset;
+
+ if (xdr->len > r_xprt->rx_ia.ri_max_inline_write)
+ return false;
+
+ if (xdr->page_len) {
+ remaining = xdr->page_len;
+ offset = xdr->page_base & ~PAGE_MASK;
+ count = 0;
+ while (remaining) {
+ remaining -= min_t(unsigned int,
+ PAGE_SIZE - offset, remaining);
+ offset = 0;
+ if (++count > r_xprt->rx_ia.ri_max_send_sges)
+ return false;
+ }
+ }
- return rqst->rq_snd_buf.len <= ia->ri_max_inline_write;
+ return true;
}
/* The client can't know how large the actual reply will be. Thus it
@@ -186,9 +206,9 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
*/
static int
-rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
- enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg,
- bool reminv_expected)
+rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
+ unsigned int pos, enum rpcrdma_chunktype type,
+ struct rpcrdma_mr_seg *seg)
{
int len, n, p, page_base;
struct page **ppages;
@@ -226,22 +246,21 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
if (len && n == RPCRDMA_MAX_SEGS)
goto out_overflow;
- /* When encoding the read list, the tail is always sent inline */
- if (type == rpcrdma_readch)
+ /* When encoding a Read chunk, the tail iovec contains an
+ * XDR pad and may be omitted.
+ */
+ if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup)
return n;
- /* When encoding the Write list, some servers need to see an extra
- * segment for odd-length Write chunks. The upper layer provides
- * space in the tail iovec for this purpose.
+ /* When encoding a Write chunk, some servers need to see an
+ * extra segment for non-XDR-aligned Write chunks. The upper
+ * layer provides space in the tail iovec that may be used
+ * for this purpose.
*/
- if (type == rpcrdma_writech && reminv_expected)
+ if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup)
return n;
if (xdrbuf->tail[0].iov_len) {
- /* the rpcrdma protocol allows us to omit any trailing
- * xdr pad bytes, saving the server an RDMA operation. */
- if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
- return n;
n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n);
if (n == RPCRDMA_MAX_SEGS)
goto out_overflow;
@@ -293,7 +312,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
if (rtype == rpcrdma_areadch)
pos = 0;
seg = req->rl_segments;
- nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, false);
+ nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos,
+ rtype, seg);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -302,7 +322,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
false, &mw);
if (n < 0)
return ERR_PTR(n);
- list_add(&mw->mw_list, &req->rl_registered);
+ rpcrdma_push_mw(mw, &req->rl_registered);
*iptr++ = xdr_one; /* item present */
@@ -355,10 +375,9 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
}
seg = req->rl_segments;
- nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
+ nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
rqst->rq_rcv_buf.head[0].iov_len,
- wtype, seg,
- r_xprt->rx_ia.ri_reminv_expected);
+ wtype, seg);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -371,7 +390,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
true, &mw);
if (n < 0)
return ERR_PTR(n);
- list_add(&mw->mw_list, &req->rl_registered);
+ rpcrdma_push_mw(mw, &req->rl_registered);
iptr = xdr_encode_rdma_segment(iptr, mw);
@@ -423,8 +442,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
}
seg = req->rl_segments;
- nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
- r_xprt->rx_ia.ri_reminv_expected);
+ nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -437,7 +455,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
true, &mw);
if (n < 0)
return ERR_PTR(n);
- list_add(&mw->mw_list, &req->rl_registered);
+ rpcrdma_push_mw(mw, &req->rl_registered);
iptr = xdr_encode_rdma_segment(iptr, mw);
@@ -741,13 +759,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
iptr = headerp->rm_body.rm_chunks;
iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
if (IS_ERR(iptr))
- goto out_unmap;
+ goto out_err;
iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype);
if (IS_ERR(iptr))
- goto out_unmap;
+ goto out_err;
iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype);
if (IS_ERR(iptr))
- goto out_unmap;
+ goto out_err;
hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
@@ -758,12 +776,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen,
&rqst->rq_snd_buf, rtype)) {
iptr = ERR_PTR(-EIO);
- goto out_unmap;
+ goto out_err;
}
return 0;
-out_unmap:
- r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
+out_err:
+ pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n",
+ PTR_ERR(iptr));
+ r_xprt->rx_stats.failed_marshal_count++;
return PTR_ERR(iptr);
}
@@ -786,7 +806,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp)
ifdebug(FACILITY) {
u64 off;
xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
- dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n",
+ dprintk("RPC: %s: chunk %d@0x%016llx:0x%08x\n",
__func__,
be32_to_cpu(seg->rs_length),
(unsigned long long)off,
@@ -906,28 +926,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
return fixup_copy_count;
}
-void
-rpcrdma_connect_worker(struct work_struct *work)
-{
- struct rpcrdma_ep *ep =
- container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
- struct rpcrdma_xprt *r_xprt =
- container_of(ep, struct rpcrdma_xprt, rx_ep);
- struct rpc_xprt *xprt = &r_xprt->rx_xprt;
-
- spin_lock_bh(&xprt->transport_lock);
- if (++xprt->connect_cookie == 0) /* maintain a reserved value */
- ++xprt->connect_cookie;
- if (ep->rep_connected > 0) {
- if (!xprt_test_and_set_connected(xprt))
- xprt_wake_pending_tasks(xprt, 0);
- } else {
- if (xprt_test_and_clear_connected(xprt))
- xprt_wake_pending_tasks(xprt, -ENOTCONN);
- }
- spin_unlock_bh(&xprt->transport_lock);
-}
-
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
/* By convention, backchannel calls arrive via rdma_msg type
* messages, and never populate the chunk lists. This makes
@@ -959,18 +957,6 @@ rpcrdma_is_bcall(struct rpcrdma_msg *headerp)
}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
-/*
- * This function is called when an async event is posted to
- * the connection which changes the connection state. All it
- * does at this point is mark the connection up/down, the rpc
- * timers do the rest.
- */
-void
-rpcrdma_conn_func(struct rpcrdma_ep *ep)
-{
- schedule_delayed_work(&ep->rep_connect_worker, 0);
-}
-
/* Process received RPC/RDMA messages.
*
* Errors must result in the RPC task either being awakened, or
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index 20027f8de129..ff1df40f0d26 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -4,6 +4,7 @@
* Support for backward direction RPCs on RPC/RDMA (server-side).
*/
+#include <linux/module.h>
#include <linux/sunrpc/svc_rdma.h>
#include "xprt_rdma.h"
@@ -164,13 +165,9 @@ static int
xprt_rdma_bc_allocate(struct rpc_task *task)
{
struct rpc_rqst *rqst = task->tk_rqstp;
- struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
size_t size = rqst->rq_callsize;
- struct svcxprt_rdma *rdma;
struct page *page;
- rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
-
if (size > PAGE_SIZE) {
WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n",
size);
@@ -204,19 +201,20 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
{
struct rpc_xprt *xprt = rqst->rq_xprt;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer;
+ __be32 *p;
int rc;
/* Space in the send buffer for an RPC/RDMA header is reserved
* via xprt->tsh_size.
*/
- headerp->rm_xid = rqst->rq_xid;
- headerp->rm_vers = rpcrdma_version;
- headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
- headerp->rm_type = rdma_msg;
- headerp->rm_body.rm_chunks[0] = xdr_zero;
- headerp->rm_body.rm_chunks[1] = xdr_zero;
- headerp->rm_body.rm_chunks[2] = xdr_zero;
+ p = rqst->rq_buffer;
+ *p++ = rqst->rq_xid;
+ *p++ = rpcrdma_version;
+ *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
+ *p++ = rdma_msg;
+ *p++ = xdr_zero;
+ *p++ = xdr_zero;
+ *p = xdr_zero;
#ifdef SVCRDMA_BACKCHANNEL_DEBUG
pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer);
@@ -359,6 +357,7 @@ xprt_setup_rdma_bc(struct xprt_create *args)
out_fail:
xprt_rdma_free_addresses(xprt);
args->bc_xprt->xpt_bc_xprt = NULL;
+ args->bc_xprt->xpt_bc_xps = NULL;
xprt_put(xprt);
xprt_free(xprt);
return ERR_PTR(-EINVAL);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 0ba9887f3e22..1c4aabf0f657 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
* Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
@@ -47,102 +48,43 @@
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
-/*
- * Decodes a read chunk list. The expected format is as follows:
- * descrim : xdr_one
- * position : __be32 offset into XDR stream
- * handle : __be32 RKEY
- * . . .
- * end-of-list: xdr_zero
- */
-static __be32 *decode_read_list(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_read_list(__be32 *p, __be32 *end)
{
- struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
+ __be32 *next;
- while (ch->rc_discrim != xdr_zero) {
- if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) >
- (unsigned long)vaend) {
- dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
+ while (*p++ != xdr_zero) {
+ next = p + rpcrdma_readchunk_maxsz - 1;
+ if (next > end)
return NULL;
- }
- ch++;
+ p = next;
}
- return &ch->rc_position;
+ return p;
}
-/*
- * Decodes a write chunk list. The expected format is as follows:
- * descrim : xdr_one
- * nchunks : <count>
- * handle : __be32 RKEY ---+
- * length : __be32 <len of segment> |
- * offset : remove va + <count>
- * . . . |
- * ---+
- */
-static __be32 *decode_write_list(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_write_list(__be32 *p, __be32 *end)
{
- unsigned long start, end;
- int nchunks;
-
- struct rpcrdma_write_array *ary =
- (struct rpcrdma_write_array *)va;
+ __be32 *next;
- /* Check for not write-array */
- if (ary->wc_discrim == xdr_zero)
- return &ary->wc_nchunks;
-
- if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
- (unsigned long)vaend) {
- dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
- return NULL;
- }
- nchunks = be32_to_cpu(ary->wc_nchunks);
-
- start = (unsigned long)&ary->wc_array[0];
- end = (unsigned long)vaend;
- if (nchunks < 0 ||
- nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
- (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
- dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
- ary, nchunks, vaend);
- return NULL;
+ while (*p++ != xdr_zero) {
+ next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
+ if (next > end)
+ return NULL;
+ p = next;
}
- /*
- * rs_length is the 2nd 4B field in wc_target and taking its
- * address skips the list terminator
- */
- return &ary->wc_array[nchunks].wc_target.rs_length;
+ return p;
}
-static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_reply_chunk(__be32 *p, __be32 *end)
{
- unsigned long start, end;
- int nchunks;
- struct rpcrdma_write_array *ary =
- (struct rpcrdma_write_array *)va;
-
- /* Check for no reply-array */
- if (ary->wc_discrim == xdr_zero)
- return &ary->wc_nchunks;
-
- if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
- (unsigned long)vaend) {
- dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
- return NULL;
- }
- nchunks = be32_to_cpu(ary->wc_nchunks);
-
- start = (unsigned long)&ary->wc_array[0];
- end = (unsigned long)vaend;
- if (nchunks < 0 ||
- nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
- (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
- dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
- ary, nchunks, vaend);
- return NULL;
+ __be32 *next;
+
+ if (*p++ != xdr_zero) {
+ next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
+ if (next > end)
+ return NULL;
+ p = next;
}
- return (__be32 *)&ary->wc_array[nchunks];
+ return p;
}
/**
@@ -158,87 +100,71 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
*/
int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
{
- struct rpcrdma_msg *rmsgp;
- __be32 *va, *vaend;
- unsigned int len;
- u32 hdr_len;
+ __be32 *p, *end, *rdma_argp;
+ unsigned int hdr_len;
/* Verify that there's enough bytes for header + something */
- if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) {
- dprintk("svcrdma: header too short = %d\n",
- rq_arg->len);
- return -EINVAL;
- }
+ if (rq_arg->len <= RPCRDMA_HDRLEN_ERR)
+ goto out_short;
- rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base;
- if (rmsgp->rm_vers != rpcrdma_version) {
- dprintk("%s: bad version %u\n", __func__,
- be32_to_cpu(rmsgp->rm_vers));
- return -EPROTONOSUPPORT;
- }
+ rdma_argp = rq_arg->head[0].iov_base;
+ if (*(rdma_argp + 1) != rpcrdma_version)
+ goto out_version;
- switch (be32_to_cpu(rmsgp->rm_type)) {
- case RDMA_MSG:
- case RDMA_NOMSG:
+ switch (*(rdma_argp + 3)) {
+ case rdma_msg:
+ case rdma_nomsg:
break;
- case RDMA_DONE:
- /* Just drop it */
- dprintk("svcrdma: dropping RDMA_DONE message\n");
- return 0;
-
- case RDMA_ERROR:
- /* Possible if this is a backchannel reply.
- * XXX: We should cancel this XID, though.
- */
- dprintk("svcrdma: dropping RDMA_ERROR message\n");
- return 0;
-
- case RDMA_MSGP:
- /* Pull in the extra for the padded case, bump our pointer */
- rmsgp->rm_body.rm_padded.rm_align =
- be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align);
- rmsgp->rm_body.rm_padded.rm_thresh =
- be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
-
- va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
- rq_arg->head[0].iov_base = va;
- len = (u32)((unsigned long)va - (unsigned long)rmsgp);
- rq_arg->head[0].iov_len -= len;
- if (len > rq_arg->len)
- return -EINVAL;
- return len;
- default:
- dprintk("svcrdma: bad rdma procedure (%u)\n",
- be32_to_cpu(rmsgp->rm_type));
- return -EINVAL;
- }
+ case rdma_done:
+ goto out_drop;
- /* The chunk list may contain either a read chunk list or a write
- * chunk list and a reply chunk list.
- */
- va = &rmsgp->rm_body.rm_chunks[0];
- vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len);
- va = decode_read_list(va, vaend);
- if (!va) {
- dprintk("svcrdma: failed to decode read list\n");
- return -EINVAL;
- }
- va = decode_write_list(va, vaend);
- if (!va) {
- dprintk("svcrdma: failed to decode write list\n");
- return -EINVAL;
- }
- va = decode_reply_array(va, vaend);
- if (!va) {
- dprintk("svcrdma: failed to decode reply chunk\n");
- return -EINVAL;
+ case rdma_error:
+ goto out_drop;
+
+ default:
+ goto out_proc;
}
- rq_arg->head[0].iov_base = va;
- hdr_len = (unsigned long)va - (unsigned long)rmsgp;
+ end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len);
+ p = xdr_check_read_list(rdma_argp + 4, end);
+ if (!p)
+ goto out_inval;
+ p = xdr_check_write_list(p, end);
+ if (!p)
+ goto out_inval;
+ p = xdr_check_reply_chunk(p, end);
+ if (!p)
+ goto out_inval;
+ if (p > end)
+ goto out_inval;
+
+ rq_arg->head[0].iov_base = p;
+ hdr_len = (unsigned long)p - (unsigned long)rdma_argp;
rq_arg->head[0].iov_len -= hdr_len;
return hdr_len;
+
+out_short:
+ dprintk("svcrdma: header too short = %d\n", rq_arg->len);
+ return -EINVAL;
+
+out_version:
+ dprintk("svcrdma: bad xprt version: %u\n",
+ be32_to_cpup(rdma_argp + 1));
+ return -EPROTONOSUPPORT;
+
+out_drop:
+ dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n");
+ return 0;
+
+out_proc:
+ dprintk("svcrdma: bad rdma procedure (%u)\n",
+ be32_to_cpup(rdma_argp + 3));
+ return -EINVAL;
+
+out_inval:
+ dprintk("svcrdma: failed to parse transport header\n");
+ return -EINVAL;
}
int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
@@ -249,7 +175,7 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
*va++ = rmsgp->rm_xid;
*va++ = rmsgp->rm_vers;
- *va++ = cpu_to_be32(xprt->sc_max_requests);
+ *va++ = xprt->sc_fc_credits;
*va++ = rdma_error;
*va++ = cpu_to_be32(err);
if (err == ERR_VERS) {
@@ -260,32 +186,35 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
return (int)((unsigned long)va - (unsigned long)startp);
}
-int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
+/**
+ * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header
+ * @rdma_resp: buffer containing Reply transport header
+ *
+ * Returns length of transport header, in bytes.
+ */
+unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp)
{
- struct rpcrdma_write_array *wr_ary;
+ unsigned int nsegs;
+ __be32 *p;
- /* There is no read-list in a reply */
+ p = rdma_resp;
- /* skip write list */
- wr_ary = (struct rpcrdma_write_array *)
- &rmsgp->rm_body.rm_chunks[1];
- if (wr_ary->wc_discrim)
- wr_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)].
- wc_target.rs_length;
- else
- wr_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_nchunks;
-
- /* skip reply array */
- if (wr_ary->wc_discrim)
- wr_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)];
- else
- wr_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_nchunks;
-
- return (unsigned long) wr_ary - (unsigned long) rmsgp;
+ /* RPC-over-RDMA V1 replies never have a Read list. */
+ p += rpcrdma_fixed_maxsz + 1;
+
+ /* Skip Write list. */
+ while (*p++ != xdr_zero) {
+ nsegs = be32_to_cpup(p++);
+ p += nsegs * rpcrdma_segment_maxsz;
+ }
+
+ /* Skip Reply chunk. */
+ if (*p++ != xdr_zero) {
+ nsegs = be32_to_cpup(p++);
+ p += nsegs * rpcrdma_segment_maxsz;
+ }
+
+ return (unsigned long)p - (unsigned long)rdma_resp;
}
void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
@@ -326,19 +255,3 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
seg->rs_offset = rs_offset;
seg->rs_length = cpu_to_be32(write_len);
}
-
-void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
- struct rpcrdma_msg *rdma_argp,
- struct rpcrdma_msg *rdma_resp,
- enum rpcrdma_proc rdma_type)
-{
- rdma_resp->rm_xid = rdma_argp->rm_xid;
- rdma_resp->rm_vers = rdma_argp->rm_vers;
- rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests);
- rdma_resp->rm_type = cpu_to_be32(rdma_type);
-
- /* Encode <nul> chunks lists */
- rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
- rdma_resp->rm_body.rm_chunks[1] = xdr_zero;
- rdma_resp->rm_body.rm_chunks[2] = xdr_zero;
-}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index ad1df979b3f0..f7b2daf72a86 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -279,7 +279,6 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
frmr->sg);
return -ENOMEM;
}
- atomic_inc(&xprt->sc_dma_used);
n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE);
if (unlikely(n != frmr->sg_nents)) {
@@ -348,8 +347,6 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
atomic_inc(&rdma_stat_read);
return ret;
err:
- ib_dma_unmap_sg(xprt->sc_cm_id->device,
- frmr->sg, frmr->sg_nents, frmr->direction);
svc_rdma_put_context(ctxt, 0);
svc_rdma_put_frmr(xprt, frmr);
return ret;
@@ -374,9 +371,7 @@ rdma_copy_tail(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head,
u32 position, u32 byte_count, u32 page_offset, int page_no)
{
char *srcp, *destp;
- int ret;
- ret = 0;
srcp = head->arg.head[0].iov_base + position;
byte_count = head->arg.head[0].iov_len - position;
if (byte_count > PAGE_SIZE) {
@@ -415,6 +410,20 @@ done:
return 1;
}
+/* Returns the address of the first read chunk or <nul> if no read chunk
+ * is present
+ */
+static struct rpcrdma_read_chunk *
+svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp)
+{
+ struct rpcrdma_read_chunk *ch =
+ (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+
+ if (ch->rc_discrim == xdr_zero)
+ return NULL;
+ return ch;
+}
+
static int rdma_read_chunks(struct svcxprt_rdma *xprt,
struct rpcrdma_msg *rmsgp,
struct svc_rqst *rqstp,
@@ -597,26 +606,24 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
dprintk("svcrdma: rqstp=%p\n", rqstp);
- spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
+ spin_lock(&rdma_xprt->sc_rq_dto_lock);
if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
- ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
- struct svc_rdma_op_ctxt,
- dto_q);
- list_del_init(&ctxt->dto_q);
- spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+ ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q,
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
+ spin_unlock(&rdma_xprt->sc_rq_dto_lock);
rdma_read_complete(rqstp, ctxt);
goto complete;
} else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
- ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
- struct svc_rdma_op_ctxt,
- dto_q);
- list_del_init(&ctxt->dto_q);
+ ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q,
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
} else {
atomic_inc(&rdma_stat_rq_starve);
clear_bit(XPT_DATA, &xprt->xpt_flags);
ctxt = NULL;
}
- spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+ spin_unlock(&rdma_xprt->sc_rq_dto_lock);
if (!ctxt) {
/* This is the EAGAIN path. The svc_recv routine will
* return -EAGAIN, the nfsd thread will go to call into
@@ -627,8 +634,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
goto defer;
goto out;
}
- dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
- ctxt, rdma_xprt, rqstp, ctxt->wc_status);
+ dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p\n",
+ ctxt, rdma_xprt, rqstp);
atomic_inc(&rdma_stat_recv);
/* Build up the XDR from the receive buffers. */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index f5a91edcd233..515221b16d09 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -153,76 +153,35 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
return dma_addr;
}
-/* Returns the address of the first read chunk or <nul> if no read chunk
- * is present
+/* Parse the RPC Call's transport header.
*/
-struct rpcrdma_read_chunk *
-svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp)
+static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
+ struct rpcrdma_write_array **write,
+ struct rpcrdma_write_array **reply)
{
- struct rpcrdma_read_chunk *ch =
- (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+ __be32 *p;
- if (ch->rc_discrim == xdr_zero)
- return NULL;
- return ch;
-}
-
-/* Returns the address of the first read write array element or <nul>
- * if no write array list is present
- */
-static struct rpcrdma_write_array *
-svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp)
-{
- if (rmsgp->rm_body.rm_chunks[0] != xdr_zero ||
- rmsgp->rm_body.rm_chunks[1] == xdr_zero)
- return NULL;
- return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1];
-}
-
-/* Returns the address of the first reply array element or <nul> if no
- * reply array is present
- */
-static struct rpcrdma_write_array *
-svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp,
- struct rpcrdma_write_array *wr_ary)
-{
- struct rpcrdma_read_chunk *rch;
- struct rpcrdma_write_array *rp_ary;
-
- /* XXX: Need to fix when reply chunk may occur with read list
- * and/or write list.
- */
- if (rmsgp->rm_body.rm_chunks[0] != xdr_zero ||
- rmsgp->rm_body.rm_chunks[1] != xdr_zero)
- return NULL;
-
- rch = svc_rdma_get_read_chunk(rmsgp);
- if (rch) {
- while (rch->rc_discrim != xdr_zero)
- rch++;
-
- /* The reply chunk follows an empty write array located
- * at 'rc_position' here. The reply array is at rc_target.
- */
- rp_ary = (struct rpcrdma_write_array *)&rch->rc_target;
- goto found_it;
- }
+ p = (__be32 *)&rmsgp->rm_body.rm_chunks[0];
- if (wr_ary) {
- int chunk = be32_to_cpu(wr_ary->wc_nchunks);
+ /* Read list */
+ while (*p++ != xdr_zero)
+ p += 5;
- rp_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_array[chunk].wc_target.rs_length;
- goto found_it;
+ /* Write list */
+ if (*p != xdr_zero) {
+ *write = (struct rpcrdma_write_array *)p;
+ while (*p++ != xdr_zero)
+ p += 1 + be32_to_cpu(*p) * 4;
+ } else {
+ *write = NULL;
+ p++;
}
- /* No read list, no write list */
- rp_ary = (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[2];
-
- found_it:
- if (rp_ary->wc_discrim == xdr_zero)
- return NULL;
- return rp_ary;
+ /* Reply chunk */
+ if (*p != xdr_zero)
+ *reply = (struct rpcrdma_write_array *)p;
+ else
+ *reply = NULL;
}
/* RPC-over-RDMA Version One private extension: Remote Invalidation.
@@ -240,31 +199,22 @@ static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp,
{
struct rpcrdma_read_chunk *rd_ary;
struct rpcrdma_segment *arg_ch;
- u32 inv_rkey;
-
- inv_rkey = 0;
- rd_ary = svc_rdma_get_read_chunk(rdma_argp);
- if (rd_ary) {
- inv_rkey = be32_to_cpu(rd_ary->rc_target.rs_handle);
- goto out;
- }
+ rd_ary = (struct rpcrdma_read_chunk *)&rdma_argp->rm_body.rm_chunks[0];
+ if (rd_ary->rc_discrim != xdr_zero)
+ return be32_to_cpu(rd_ary->rc_target.rs_handle);
if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) {
arg_ch = &wr_ary->wc_array[0].wc_target;
- inv_rkey = be32_to_cpu(arg_ch->rs_handle);
- goto out;
+ return be32_to_cpu(arg_ch->rs_handle);
}
if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) {
arg_ch = &rp_ary->wc_array[0].wc_target;
- inv_rkey = be32_to_cpu(arg_ch->rs_handle);
- goto out;
+ return be32_to_cpu(arg_ch->rs_handle);
}
-out:
- dprintk("svcrdma: Send With Invalidate rkey=%08x\n", inv_rkey);
- return inv_rkey;
+ return 0;
}
/* Assumptions:
@@ -526,7 +476,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
/* Prepare the SGE for the RPCRDMA Header */
ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
- ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
+ ctxt->sge[0].length =
+ svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp);
ctxt->sge[0].addr =
ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
ctxt->sge[0].length, DMA_TO_DEVICE);
@@ -609,12 +560,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
struct rpcrdma_msg *rdma_argp;
struct rpcrdma_msg *rdma_resp;
struct rpcrdma_write_array *wr_ary, *rp_ary;
- enum rpcrdma_proc reply_type;
int ret;
int inline_bytes;
struct page *res_page;
struct svc_rdma_req_map *vec;
u32 inv_rkey;
+ __be32 *p;
dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
@@ -622,8 +573,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
* places this at the start of page 0.
*/
rdma_argp = page_address(rqstp->rq_pages[0]);
- wr_ary = svc_rdma_get_write_array(rdma_argp);
- rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary);
+ svc_rdma_get_write_arrays(rdma_argp, &wr_ary, &rp_ary);
inv_rkey = 0;
if (rdma->sc_snd_w_inv)
@@ -636,18 +586,28 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
goto err0;
inline_bytes = rqstp->rq_res.len;
- /* Create the RDMA response header */
+ /* Create the RDMA response header. xprt->xpt_mutex,
+ * acquired in svc_send(), serializes RPC replies. The
+ * code path below that inserts the credit grant value
+ * into each transport header runs only inside this
+ * critical section.
+ */
ret = -ENOMEM;
res_page = alloc_page(GFP_KERNEL);
if (!res_page)
goto err0;
rdma_resp = page_address(res_page);
- if (rp_ary)
- reply_type = RDMA_NOMSG;
- else
- reply_type = RDMA_MSG;
- svc_rdma_xdr_encode_reply_header(rdma, rdma_argp,
- rdma_resp, reply_type);
+
+ p = &rdma_resp->rm_xid;
+ *p++ = rdma_argp->rm_xid;
+ *p++ = rdma_argp->rm_vers;
+ *p++ = rdma->sc_fc_credits;
+ *p++ = rp_ary ? rdma_nomsg : rdma_msg;
+
+ /* Start with empty chunks */
+ *p++ = xdr_zero;
+ *p++ = xdr_zero;
+ *p = xdr_zero;
/* Send any write-chunk data and build resp write-list */
if (wr_ary) {
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 1334de2715c2..fc8f14c7bfec 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -41,6 +41,7 @@
*/
#include <linux/sunrpc/svc_xprt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/interrupt.h>
@@ -126,6 +127,7 @@ static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *serv,
xprt = &cma_xprt->sc_xprt;
svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv);
+ set_bit(XPT_CONG_CTRL, &xprt->xpt_flags);
serv->sv_bc_xprt = xprt;
dprintk("svcrdma: %s(%p)\n", __func__, xprt);
@@ -156,8 +158,7 @@ static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt,
ctxt = kmalloc(sizeof(*ctxt), flags);
if (ctxt) {
ctxt->xprt = xprt;
- INIT_LIST_HEAD(&ctxt->free);
- INIT_LIST_HEAD(&ctxt->dto_q);
+ INIT_LIST_HEAD(&ctxt->list);
}
return ctxt;
}
@@ -179,7 +180,7 @@ static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt)
dprintk("svcrdma: No memory for RDMA ctxt\n");
return false;
}
- list_add(&ctxt->free, &xprt->sc_ctxts);
+ list_add(&ctxt->list, &xprt->sc_ctxts);
}
return true;
}
@@ -188,15 +189,15 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
{
struct svc_rdma_op_ctxt *ctxt = NULL;
- spin_lock_bh(&xprt->sc_ctxt_lock);
+ spin_lock(&xprt->sc_ctxt_lock);
xprt->sc_ctxt_used++;
if (list_empty(&xprt->sc_ctxts))
goto out_empty;
ctxt = list_first_entry(&xprt->sc_ctxts,
- struct svc_rdma_op_ctxt, free);
- list_del_init(&ctxt->free);
- spin_unlock_bh(&xprt->sc_ctxt_lock);
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
+ spin_unlock(&xprt->sc_ctxt_lock);
out:
ctxt->count = 0;
@@ -208,15 +209,15 @@ out_empty:
/* Either pre-allocation missed the mark, or send
* queue accounting is broken.
*/
- spin_unlock_bh(&xprt->sc_ctxt_lock);
+ spin_unlock(&xprt->sc_ctxt_lock);
ctxt = alloc_ctxt(xprt, GFP_NOIO);
if (ctxt)
goto out;
- spin_lock_bh(&xprt->sc_ctxt_lock);
+ spin_lock(&xprt->sc_ctxt_lock);
xprt->sc_ctxt_used--;
- spin_unlock_bh(&xprt->sc_ctxt_lock);
+ spin_unlock(&xprt->sc_ctxt_lock);
WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n");
return NULL;
}
@@ -226,25 +227,22 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
struct svcxprt_rdma *xprt = ctxt->xprt;
struct ib_device *device = xprt->sc_cm_id->device;
u32 lkey = xprt->sc_pd->local_dma_lkey;
- unsigned int i, count;
+ unsigned int i;
- for (count = 0, i = 0; i < ctxt->mapped_sges; i++) {
+ for (i = 0; i < ctxt->mapped_sges; i++) {
/*
* Unmap the DMA addr in the SGE if the lkey matches
* the local_dma_lkey, otherwise, ignore it since it is
* an FRMR lkey and will be unmapped later when the
* last WR that uses it completes.
*/
- if (ctxt->sge[i].lkey == lkey) {
- count++;
+ if (ctxt->sge[i].lkey == lkey)
ib_dma_unmap_page(device,
ctxt->sge[i].addr,
ctxt->sge[i].length,
ctxt->direction);
- }
}
ctxt->mapped_sges = 0;
- atomic_sub(count, &xprt->sc_dma_used);
}
void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
@@ -256,10 +254,10 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
for (i = 0; i < ctxt->count; i++)
put_page(ctxt->pages[i]);
- spin_lock_bh(&xprt->sc_ctxt_lock);
+ spin_lock(&xprt->sc_ctxt_lock);
xprt->sc_ctxt_used--;
- list_add(&ctxt->free, &xprt->sc_ctxts);
- spin_unlock_bh(&xprt->sc_ctxt_lock);
+ list_add(&ctxt->list, &xprt->sc_ctxts);
+ spin_unlock(&xprt->sc_ctxt_lock);
}
static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
@@ -268,8 +266,8 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
struct svc_rdma_op_ctxt *ctxt;
ctxt = list_first_entry(&xprt->sc_ctxts,
- struct svc_rdma_op_ctxt, free);
- list_del(&ctxt->free);
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
kfree(ctxt);
}
}
@@ -398,7 +396,6 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
/* WARNING: Only wc->wr_cqe and wc->status are reliable */
ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe);
- ctxt->wc_status = wc->status;
svc_rdma_unmap_dma(ctxt);
if (wc->status != IB_WC_SUCCESS)
@@ -407,7 +404,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
/* All wc fields are now known to be valid */
ctxt->byte_len = wc->byte_len;
spin_lock(&xprt->sc_rq_dto_lock);
- list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
+ list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q);
spin_unlock(&xprt->sc_rq_dto_lock);
set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
@@ -436,7 +433,7 @@ static void svc_rdma_send_wc_common(struct svcxprt_rdma *xprt,
goto err;
out:
- atomic_dec(&xprt->sc_sq_count);
+ atomic_inc(&xprt->sc_sq_avail);
wake_up(&xprt->sc_send_wait);
return;
@@ -528,7 +525,7 @@ void svc_rdma_wc_read(struct ib_cq *cq, struct ib_wc *wc)
read_hdr = ctxt->read_hdr;
spin_lock(&xprt->sc_rq_dto_lock);
- list_add_tail(&read_hdr->dto_q,
+ list_add_tail(&read_hdr->list,
&xprt->sc_read_complete_q);
spin_unlock(&xprt->sc_rq_dto_lock);
@@ -560,7 +557,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
return NULL;
svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
- INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
@@ -574,6 +570,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
spin_lock_init(&cma_xprt->sc_ctxt_lock);
spin_lock_init(&cma_xprt->sc_map_lock);
+ /*
+ * Note that this implies that the underlying transport support
+ * has some form of congestion control (see RFC 7530 section 3.1
+ * paragraph 2). For now, we assume that all supported RDMA
+ * transports are suitable here.
+ */
+ set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
+
if (listener)
set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
@@ -926,14 +930,14 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
{
struct svc_rdma_fastreg_mr *frmr = NULL;
- spin_lock_bh(&rdma->sc_frmr_q_lock);
+ spin_lock(&rdma->sc_frmr_q_lock);
if (!list_empty(&rdma->sc_frmr_q)) {
frmr = list_entry(rdma->sc_frmr_q.next,
struct svc_rdma_fastreg_mr, frmr_list);
list_del_init(&frmr->frmr_list);
frmr->sg_nents = 0;
}
- spin_unlock_bh(&rdma->sc_frmr_q_lock);
+ spin_unlock(&rdma->sc_frmr_q_lock);
if (frmr)
return frmr;
@@ -946,11 +950,10 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
if (frmr) {
ib_dma_unmap_sg(rdma->sc_cm_id->device,
frmr->sg, frmr->sg_nents, frmr->direction);
- atomic_dec(&rdma->sc_dma_used);
- spin_lock_bh(&rdma->sc_frmr_q_lock);
+ spin_lock(&rdma->sc_frmr_q_lock);
WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
- spin_unlock_bh(&rdma->sc_frmr_q_lock);
+ spin_unlock(&rdma->sc_frmr_q_lock);
}
}
@@ -973,6 +976,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
struct rpcrdma_connect_private pmsg;
struct ib_qp_init_attr qp_attr;
struct ib_device *dev;
+ struct sockaddr *sap;
unsigned int i;
int ret = 0;
@@ -1005,11 +1009,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_max_req_size = svcrdma_max_req_size;
newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
svcrdma_max_requests);
+ newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests);
newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
svcrdma_max_bc_requests);
newxprt->sc_rq_depth = newxprt->sc_max_requests +
newxprt->sc_max_bc_requests;
newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
+ atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
if (!svc_rdma_prealloc_ctxts(newxprt))
goto errout;
@@ -1029,13 +1035,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
goto errout;
}
newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
- 0, IB_POLL_SOFTIRQ);
+ 0, IB_POLL_WORKQUEUE);
if (IS_ERR(newxprt->sc_sq_cq)) {
dprintk("svcrdma: error creating SQ CQ for connect request\n");
goto errout;
}
newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth,
- 0, IB_POLL_SOFTIRQ);
+ 0, IB_POLL_WORKQUEUE);
if (IS_ERR(newxprt->sc_rq_cq)) {
dprintk("svcrdma: error creating RQ CQ for connect request\n");
goto errout;
@@ -1052,18 +1058,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
qp_attr.qp_type = IB_QPT_RC;
qp_attr.send_cq = newxprt->sc_sq_cq;
qp_attr.recv_cq = newxprt->sc_rq_cq;
- dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n"
- " cm_id->device=%p, sc_pd->device=%p\n"
- " cap.max_send_wr = %d\n"
- " cap.max_recv_wr = %d\n"
- " cap.max_send_sge = %d\n"
- " cap.max_recv_sge = %d\n",
- newxprt->sc_cm_id, newxprt->sc_pd,
- dev, newxprt->sc_pd->device,
- qp_attr.cap.max_send_wr,
- qp_attr.cap.max_recv_wr,
- qp_attr.cap.max_send_sge,
- qp_attr.cap.max_recv_sge);
+ dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n",
+ newxprt->sc_cm_id, newxprt->sc_pd);
+ dprintk(" cap.max_send_wr = %d, cap.max_recv_wr = %d\n",
+ qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr);
+ dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n",
+ qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge);
ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
if (ret) {
@@ -1146,31 +1146,16 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
goto errout;
}
- dprintk("svcrdma: new connection %p accepted with the following "
- "attributes:\n"
- " local_ip : %pI4\n"
- " local_port : %d\n"
- " remote_ip : %pI4\n"
- " remote_port : %d\n"
- " max_sge : %d\n"
- " max_sge_rd : %d\n"
- " sq_depth : %d\n"
- " max_requests : %d\n"
- " ord : %d\n",
- newxprt,
- &((struct sockaddr_in *)&newxprt->sc_cm_id->
- route.addr.src_addr)->sin_addr.s_addr,
- ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
- route.addr.src_addr)->sin_port),
- &((struct sockaddr_in *)&newxprt->sc_cm_id->
- route.addr.dst_addr)->sin_addr.s_addr,
- ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
- route.addr.dst_addr)->sin_port),
- newxprt->sc_max_sge,
- newxprt->sc_max_sge_rd,
- newxprt->sc_sq_depth,
- newxprt->sc_max_requests,
- newxprt->sc_ord);
+ dprintk("svcrdma: new connection %p accepted:\n", newxprt);
+ sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
+ dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap));
+ sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
+ dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap));
+ dprintk(" max_sge : %d\n", newxprt->sc_max_sge);
+ dprintk(" max_sge_rd : %d\n", newxprt->sc_max_sge_rd);
+ dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth);
+ dprintk(" max_requests : %d\n", newxprt->sc_max_requests);
+ dprintk(" ord : %d\n", newxprt->sc_ord);
return &newxprt->sc_xprt;
@@ -1224,9 +1209,9 @@ static void __svc_rdma_free(struct work_struct *work)
ib_drain_qp(rdma->sc_qp);
/* We should only be called from kref_put */
- if (atomic_read(&xprt->xpt_ref.refcount) != 0)
+ if (kref_read(&xprt->xpt_ref) != 0)
pr_err("svcrdma: sc_xprt still in use? (%d)\n",
- atomic_read(&xprt->xpt_ref.refcount));
+ kref_read(&xprt->xpt_ref));
/*
* Destroy queued, but not processed read completions. Note
@@ -1236,20 +1221,18 @@ static void __svc_rdma_free(struct work_struct *work)
*/
while (!list_empty(&rdma->sc_read_complete_q)) {
struct svc_rdma_op_ctxt *ctxt;
- ctxt = list_entry(rdma->sc_read_complete_q.next,
- struct svc_rdma_op_ctxt,
- dto_q);
- list_del_init(&ctxt->dto_q);
+ ctxt = list_first_entry(&rdma->sc_read_complete_q,
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
svc_rdma_put_context(ctxt, 1);
}
/* Destroy queued, but not processed recv completions */
while (!list_empty(&rdma->sc_rq_dto_q)) {
struct svc_rdma_op_ctxt *ctxt;
- ctxt = list_entry(rdma->sc_rq_dto_q.next,
- struct svc_rdma_op_ctxt,
- dto_q);
- list_del_init(&ctxt->dto_q);
+ ctxt = list_first_entry(&rdma->sc_rq_dto_q,
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
svc_rdma_put_context(ctxt, 1);
}
@@ -1257,9 +1240,6 @@ static void __svc_rdma_free(struct work_struct *work)
if (rdma->sc_ctxt_used != 0)
pr_err("svcrdma: ctxt still in use? (%d)\n",
rdma->sc_ctxt_used);
- if (atomic_read(&rdma->sc_dma_used) != 0)
- pr_err("svcrdma: dma still in use? (%d)\n",
- atomic_read(&rdma->sc_dma_used));
/* Final put of backchannel client transport */
if (xprt->xpt_bc_xprt) {
@@ -1339,15 +1319,13 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
/* If the SQ is full, wait until an SQ entry is available */
while (1) {
- spin_lock_bh(&xprt->sc_lock);
- if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) {
- spin_unlock_bh(&xprt->sc_lock);
+ if ((atomic_sub_return(wr_count, &xprt->sc_sq_avail) < 0)) {
atomic_inc(&rdma_stat_sq_starve);
/* Wait until SQ WR available if SQ still full */
+ atomic_add(wr_count, &xprt->sc_sq_avail);
wait_event(xprt->sc_send_wait,
- atomic_read(&xprt->sc_sq_count) <
- xprt->sc_sq_depth);
+ atomic_read(&xprt->sc_sq_avail) > wr_count);
if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
return -ENOTCONN;
continue;
@@ -1357,21 +1335,17 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
svc_xprt_get(&xprt->sc_xprt);
/* Bump used SQ WR count and post */
- atomic_add(wr_count, &xprt->sc_sq_count);
ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
if (ret) {
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
- atomic_sub(wr_count, &xprt->sc_sq_count);
for (i = 0; i < wr_count; i ++)
svc_xprt_put(&xprt->sc_xprt);
- dprintk("svcrdma: failed to post SQ WR rc=%d, "
- "sc_sq_count=%d, sc_sq_depth=%d\n",
- ret, atomic_read(&xprt->sc_sq_count),
- xprt->sc_sq_depth);
- }
- spin_unlock_bh(&xprt->sc_lock);
- if (ret)
+ dprintk("svcrdma: failed to post SQ WR rc=%d\n", ret);
+ dprintk(" sc_sq_avail=%d, sc_sq_depth=%d\n",
+ atomic_read(&xprt->sc_sq_avail),
+ xprt->sc_sq_depth);
wake_up(&xprt->sc_send_wait);
+ }
break;
}
return ret;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index ed5e285fd2ea..c717f5410776 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -67,7 +67,7 @@ unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_inline_write_padding;
static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
- int xprt_rdma_pad_optimize = 1;
+ int xprt_rdma_pad_optimize = 0;
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -219,6 +219,34 @@ xprt_rdma_free_addresses(struct rpc_xprt *xprt)
}
}
+void
+rpcrdma_conn_func(struct rpcrdma_ep *ep)
+{
+ schedule_delayed_work(&ep->rep_connect_worker, 0);
+}
+
+void
+rpcrdma_connect_worker(struct work_struct *work)
+{
+ struct rpcrdma_ep *ep =
+ container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
+ struct rpcrdma_xprt *r_xprt =
+ container_of(ep, struct rpcrdma_xprt, rx_ep);
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+ spin_lock_bh(&xprt->transport_lock);
+ if (++xprt->connect_cookie == 0) /* maintain a reserved value */
+ ++xprt->connect_cookie;
+ if (ep->rep_connected > 0) {
+ if (!xprt_test_and_set_connected(xprt))
+ xprt_wake_pending_tasks(xprt, 0);
+ } else {
+ if (xprt_test_and_clear_connected(xprt))
+ xprt_wake_pending_tasks(xprt, -ENOTCONN);
+ }
+ spin_unlock_bh(&xprt->transport_lock);
+}
+
static void
xprt_rdma_connect_worker(struct work_struct *work)
{
@@ -621,7 +649,8 @@ xprt_rdma_free(struct rpc_task *task)
dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);
- ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task));
+ if (unlikely(!list_empty(&req->rl_registered)))
+ ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task));
rpcrdma_unmap_sges(ia, req);
rpcrdma_buffer_put(req);
}
@@ -657,7 +686,8 @@ xprt_rdma_send_request(struct rpc_task *task)
int rc = 0;
/* On retransmit, remove any previously registered chunks */
- r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
+ if (unlikely(!list_empty(&req->rl_registered)))
+ r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
rc = rpcrdma_marshal_req(rqst);
if (rc < 0)
@@ -679,10 +709,6 @@ xprt_rdma_send_request(struct rpc_task *task)
return 0;
failed_marshal:
- dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
- __func__, rc);
- if (rc == -EIO)
- r_xprt->rx_stats.failed_marshal_count++;
if (rc != -ENOTCONN)
return rc;
drop_connection:
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index ec74289af7ec..3b332b395045 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -54,6 +54,7 @@
#include <linux/sunrpc/svc_rdma.h>
#include <asm/bitops.h>
#include <linux/module.h> /* try_module_get()/module_put() */
+#include <rdma/ib_cm.h>
#include "xprt_rdma.h"
@@ -103,9 +104,9 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
struct rpcrdma_ep *ep = context;
- pr_err("RPC: %s: %s on device %s ep %p\n",
- __func__, ib_event_msg(event->event),
- event->device->name, context);
+ pr_err("rpcrdma: %s on device %s ep %p\n",
+ ib_event_msg(event->event), event->device->name, context);
+
if (ep->rep_connected == 1) {
ep->rep_connected = -EIO;
rpcrdma_conn_func(ep);
@@ -208,6 +209,7 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
/* Default settings for RPC-over-RDMA Version One */
r_xprt->rx_ia.ri_reminv_expected = false;
+ r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
@@ -215,6 +217,7 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
pmsg->cp_magic == rpcrdma_cmp_magic &&
pmsg->cp_version == RPCRDMA_CMP_VERSION) {
r_xprt->rx_ia.ri_reminv_expected = true;
+ r_xprt->rx_ia.ri_implicit_roundup = true;
rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
}
@@ -223,8 +226,8 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
cdata->inline_rsize = rsize;
if (wsize < cdata->inline_wsize)
cdata->inline_wsize = wsize;
- pr_info("rpcrdma: max send %u, max recv %u\n",
- cdata->inline_wsize, cdata->inline_rsize);
+ dprintk("RPC: %s: max send %u, max recv %u\n",
+ __func__, cdata->inline_wsize, cdata->inline_rsize);
rpcrdma_set_max_header_sizes(r_xprt);
}
@@ -277,7 +280,14 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
connstate = -ENETDOWN;
goto connected;
case RDMA_CM_EVENT_REJECTED:
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+ pr_info("rpcrdma: connection to %pIS:%u on %s rejected: %s\n",
+ sap, rpc_get_port(sap), ia->ri_device->name,
+ rdma_reject_msg(id, event->status));
+#endif
connstate = -ECONNREFUSED;
+ if (event->status == IB_CM_REJ_STALE_CONN)
+ connstate = -EAGAIN;
goto connected;
case RDMA_CM_EVENT_DISCONNECTED:
connstate = -ECONNABORTED;
@@ -331,6 +341,7 @@ static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
struct rpcrdma_ia *ia, struct sockaddr *addr)
{
+ unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
struct rdma_cm_id *id;
int rc;
@@ -352,8 +363,12 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
__func__, rc);
goto out;
}
- wait_for_completion_interruptible_timeout(&ia->ri_done,
- msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+ rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+ if (rc < 0) {
+ dprintk("RPC: %s: wait() exited: %i\n",
+ __func__, rc);
+ goto out;
+ }
/* FIXME:
* Until xprtrdma supports DEVICE_REMOVAL, the provider must
@@ -376,8 +391,12 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
__func__, rc);
goto put;
}
- wait_for_completion_interruptible_timeout(&ia->ri_done,
- msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+ rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+ if (rc < 0) {
+ dprintk("RPC: %s: wait() exited: %i\n",
+ __func__, rc);
+ goto put;
+ }
rc = ia->ri_async_rc;
if (rc)
goto put;
@@ -477,18 +496,20 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
*/
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
- struct rpcrdma_create_data_internal *cdata)
+ struct rpcrdma_create_data_internal *cdata)
{
struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
+ unsigned int max_qp_wr, max_sge;
struct ib_cq *sendcq, *recvcq;
- unsigned int max_qp_wr;
int rc;
- if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) {
- dprintk("RPC: %s: insufficient sge's available\n",
- __func__);
+ max_sge = min_t(unsigned int, ia->ri_device->attrs.max_sge,
+ RPCRDMA_MAX_SEND_SGES);
+ if (max_sge < RPCRDMA_MIN_SEND_SGES) {
+ pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
return -ENOMEM;
}
+ ia->ri_max_send_sges = max_sge - RPCRDMA_MIN_SEND_SGES;
if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
dprintk("RPC: %s: insufficient wqe's available\n",
@@ -513,7 +534,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */
- ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES;
+ ep->rep_attr.cap.max_send_sge = max_sge;
ep->rep_attr.cap.max_recv_sge = 1;
ep->rep_attr.cap.max_inline_data = 0;
ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -532,7 +553,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
if (ep->rep_cqinit <= 2)
ep->rep_cqinit = 0; /* always signal? */
- INIT_CQCOUNT(ep);
+ rpcrdma_init_cqcount(ep, 0);
init_waitqueue_head(&ep->rep_connect_wait);
INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
@@ -631,20 +652,21 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
+ struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
+ rx_ia);
struct rdma_cm_id *id, *old;
+ struct sockaddr *sap;
+ unsigned int extras;
int rc = 0;
- int retry_count = 0;
if (ep->rep_connected != 0) {
- struct rpcrdma_xprt *xprt;
retry:
dprintk("RPC: %s: reconnecting...\n", __func__);
rpcrdma_ep_disconnect(ep, ia);
- xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
- id = rpcrdma_create_id(xprt, ia,
- (struct sockaddr *)&xprt->rx_data.addr);
+ sap = (struct sockaddr *)&r_xprt->rx_data.addr;
+ id = rpcrdma_create_id(r_xprt, ia, sap);
if (IS_ERR(id)) {
rc = -EHOSTUNREACH;
goto out;
@@ -699,51 +721,18 @@ retry:
}
wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
-
- /*
- * Check state. A non-peer reject indicates no listener
- * (ECONNREFUSED), which may be a transient state. All
- * others indicate a transport condition which has already
- * undergone a best-effort.
- */
- if (ep->rep_connected == -ECONNREFUSED &&
- ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
- dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
- goto retry;
- }
if (ep->rep_connected <= 0) {
- /* Sometimes, the only way to reliably connect to remote
- * CMs is to use same nonzero values for ORD and IRD. */
- if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
- (ep->rep_remote_cma.responder_resources == 0 ||
- ep->rep_remote_cma.initiator_depth !=
- ep->rep_remote_cma.responder_resources)) {
- if (ep->rep_remote_cma.responder_resources == 0)
- ep->rep_remote_cma.responder_resources = 1;
- ep->rep_remote_cma.initiator_depth =
- ep->rep_remote_cma.responder_resources;
+ if (ep->rep_connected == -EAGAIN)
goto retry;
- }
rc = ep->rep_connected;
- } else {
- struct rpcrdma_xprt *r_xprt;
- unsigned int extras;
-
- dprintk("RPC: %s: connected\n", __func__);
-
- r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
- extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
-
- if (extras) {
- rc = rpcrdma_ep_post_extra_recv(r_xprt, extras);
- if (rc) {
- pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n",
- __func__, rc);
- rc = 0;
- }
- }
+ goto out;
}
+ dprintk("RPC: %s: connected\n", __func__);
+ extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
+ if (extras)
+ rpcrdma_ep_post_extra_recv(r_xprt, extras);
+
out:
if (rc)
ep->rep_connected = rc;
@@ -788,9 +777,7 @@ rpcrdma_mr_recovery_worker(struct work_struct *work)
spin_lock(&buf->rb_recovery_lock);
while (!list_empty(&buf->rb_stale_mrs)) {
- mw = list_first_entry(&buf->rb_stale_mrs,
- struct rpcrdma_mw, mw_list);
- list_del_init(&mw->mw_list);
+ mw = rpcrdma_pop_mw(&buf->rb_stale_mrs);
spin_unlock(&buf->rb_recovery_lock);
dprintk("RPC: %s: recovering MR %p\n", __func__, mw);
@@ -808,7 +795,7 @@ rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
spin_lock(&buf->rb_recovery_lock);
- list_add(&mw->mw_list, &buf->rb_stale_mrs);
+ rpcrdma_push_mw(mw, &buf->rb_stale_mrs);
spin_unlock(&buf->rb_recovery_lock);
schedule_delayed_work(&buf->rb_recovery_worker, 0);
@@ -1084,11 +1071,8 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_mw *mw = NULL;
spin_lock(&buf->rb_mwlock);
- if (!list_empty(&buf->rb_mws)) {
- mw = list_first_entry(&buf->rb_mws,
- struct rpcrdma_mw, mw_list);
- list_del_init(&mw->mw_list);
- }
+ if (!list_empty(&buf->rb_mws))
+ mw = rpcrdma_pop_mw(&buf->rb_mws);
spin_unlock(&buf->rb_mwlock);
if (!mw)
@@ -1111,7 +1095,7 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
spin_lock(&buf->rb_mwlock);
- list_add_tail(&mw->mw_list, &buf->rb_mws);
+ rpcrdma_push_mw(mw, &buf->rb_mws);
spin_unlock(&buf->rb_mwlock);
}
@@ -1311,13 +1295,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
dprintk("RPC: %s: posting %d s/g entries\n",
__func__, send_wr->num_sge);
- if (DECR_CQCOUNT(ep) > 0)
- send_wr->send_flags = 0;
- else { /* Provider must take a send completion every now and then */
- INIT_CQCOUNT(ep);
- send_wr->send_flags = IB_SEND_SIGNALED;
- }
-
+ rpcrdma_set_signaled(ep, send_wr);
rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail);
if (rc)
goto out_postsend_err;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 6e1bba358203..171a35116de9 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -74,7 +74,10 @@ struct rpcrdma_ia {
unsigned int ri_max_frmr_depth;
unsigned int ri_max_inline_write;
unsigned int ri_max_inline_read;
+ unsigned int ri_max_send_sges;
bool ri_reminv_expected;
+ bool ri_implicit_roundup;
+ enum ib_mr_type ri_mrtype;
struct ib_qp_attr ri_qp_attr;
struct ib_qp_init_attr ri_qp_init_attr;
};
@@ -95,8 +98,24 @@ struct rpcrdma_ep {
struct delayed_work rep_connect_worker;
};
-#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
-#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
+static inline void
+rpcrdma_init_cqcount(struct rpcrdma_ep *ep, int count)
+{
+ atomic_set(&ep->rep_cqcount, ep->rep_cqinit - count);
+}
+
+/* To update send queue accounting, provider must take a
+ * send completion every now and then.
+ */
+static inline void
+rpcrdma_set_signaled(struct rpcrdma_ep *ep, struct ib_send_wr *send_wr)
+{
+ send_wr->send_flags = 0;
+ if (unlikely(atomic_sub_return(1, &ep->rep_cqcount) <= 0)) {
+ rpcrdma_init_cqcount(ep, 0);
+ send_wr->send_flags = IB_SEND_SIGNALED;
+ }
+}
/* Pre-allocate extra Work Requests for handling backward receives
* and sends. This is a fixed value because the Work Queues are
@@ -286,15 +305,19 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
char *mr_offset; /* kva if no page, else offset */
};
-/* Reserve enough Send SGEs to send a maximum size inline request:
+/* The Send SGE array is provisioned to send a maximum size
+ * inline request:
* - RPC-over-RDMA header
* - xdr_buf head iovec
- * - RPCRDMA_MAX_INLINE bytes, possibly unaligned, in pages
+ * - RPCRDMA_MAX_INLINE bytes, in pages
* - xdr_buf tail iovec
+ *
+ * The actual number of array elements consumed by each RPC
+ * depends on the device's max_sge limit.
*/
enum {
- RPCRDMA_MAX_SEND_PAGES = PAGE_SIZE + RPCRDMA_MAX_INLINE - 1,
- RPCRDMA_MAX_PAGE_SGES = (RPCRDMA_MAX_SEND_PAGES >> PAGE_SHIFT) + 1,
+ RPCRDMA_MIN_SEND_SGES = 3,
+ RPCRDMA_MAX_PAGE_SGES = RPCRDMA_MAX_INLINE >> PAGE_SHIFT,
RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1,
};
@@ -331,6 +354,22 @@ rpcr_to_rdmar(struct rpc_rqst *rqst)
return rqst->rq_xprtdata;
}
+static inline void
+rpcrdma_push_mw(struct rpcrdma_mw *mw, struct list_head *list)
+{
+ list_add_tail(&mw->mw_list, list);
+}
+
+static inline struct rpcrdma_mw *
+rpcrdma_pop_mw(struct list_head *list)
+{
+ struct rpcrdma_mw *mw;
+
+ mw = list_first_entry(list, struct rpcrdma_mw, mw_list);
+ list_del(&mw->mw_list);
+ return mw;
+}
+
/*
* struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
* inline requests/replies, and client/server credits.
@@ -473,6 +512,7 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
struct rpcrdma_create_data_internal *);
void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_conn_func(struct rpcrdma_ep *ep);
void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
@@ -532,13 +572,6 @@ rpcrdma_data_dir(bool writing)
}
/*
- * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
- */
-void rpcrdma_connect_worker(struct work_struct *);
-void rpcrdma_conn_func(struct rpcrdma_ep *);
-void rpcrdma_reply_handler(struct work_struct *);
-
-/*
* RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
*/
@@ -555,12 +588,14 @@ bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *,
void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *);
int rpcrdma_marshal_req(struct rpc_rqst *);
void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
+void rpcrdma_reply_handler(struct work_struct *work);
/* RPC/RDMA module init - xprtrdma/transport.c
*/
extern unsigned int xprt_rdma_max_inline_read;
void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
+void rpcrdma_connect_worker(struct work_struct *work);
void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
int xprt_rdma_init(void);
void xprt_rdma_cleanup(void);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e01c825bc683..16aff8ddc16f 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -52,6 +52,8 @@
#include "sunrpc.h"
static void xs_close(struct rpc_xprt *xprt);
+static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
+ struct socket *sock);
/*
* xprtsock tunables
@@ -666,6 +668,9 @@ static int xs_tcp_send_request(struct rpc_task *task)
if (task->tk_flags & RPC_TASK_SENT)
zerocopy = false;
+ if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state))
+ xs_tcp_set_socket_timeouts(xprt, transport->sock);
+
/* Continue transmitting the packet/record. We must be careful
* to cope with writespace callbacks arriving _after_ we have
* called sendmsg(). */
@@ -1080,10 +1085,10 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
if (sk == NULL)
goto out;
for (;;) {
- skb = skb_recv_datagram(sk, 0, 1, &err);
+ skb = skb_recv_udp(sk, 0, 1, &err);
if (skb != NULL) {
xs_udp_data_read_skb(&transport->xprt, sk, skb);
- skb_free_datagram_locked(sk, skb);
+ consume_skb(skb);
continue;
}
if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
@@ -1188,7 +1193,7 @@ static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_r
char *p;
len = sizeof(transport->tcp_xid) - transport->tcp_offset;
- dprintk("RPC: reading XID (%Zu bytes)\n", len);
+ dprintk("RPC: reading XID (%zu bytes)\n", len);
p = ((char *) &transport->tcp_xid) + transport->tcp_offset;
used = xdr_skb_read_bits(desc, p, len);
transport->tcp_offset += used;
@@ -1219,7 +1224,7 @@ static inline void xs_tcp_read_calldir(struct sock_xprt *transport,
*/
offset = transport->tcp_offset - sizeof(transport->tcp_xid);
len = sizeof(transport->tcp_calldir) - offset;
- dprintk("RPC: reading CALL/REPLY flag (%Zu bytes)\n", len);
+ dprintk("RPC: reading CALL/REPLY flag (%zu bytes)\n", len);
p = ((char *) &transport->tcp_calldir) + offset;
used = xdr_skb_read_bits(desc, p, len);
transport->tcp_offset += used;
@@ -1310,7 +1315,7 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt,
return;
}
- dprintk("RPC: XID %08x read %Zd bytes\n",
+ dprintk("RPC: XID %08x read %zd bytes\n",
ntohl(transport->tcp_xid), r);
dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, "
"tcp_reclen = %u\n", xprt, transport->tcp_copied,
@@ -1456,7 +1461,7 @@ static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_s
desc->count -= len;
desc->offset += len;
transport->tcp_offset += len;
- dprintk("RPC: discarded %Zu bytes\n", len);
+ dprintk("RPC: discarded %zu bytes\n", len);
xs_tcp_check_fraghdr(transport);
}
@@ -1734,7 +1739,9 @@ static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t
*/
static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
{
+ spin_lock_bh(&xprt->transport_lock);
xprt_adjust_cwnd(xprt, task, -ETIMEDOUT);
+ spin_unlock_bh(&xprt->transport_lock);
}
static unsigned short xs_get_random_port(void)
@@ -2235,6 +2242,66 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt)
xs_reset_transport(transport);
}
+static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
+ struct socket *sock)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ unsigned int keepidle;
+ unsigned int keepcnt;
+ unsigned int opt_on = 1;
+ unsigned int timeo;
+
+ spin_lock_bh(&xprt->transport_lock);
+ keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ);
+ keepcnt = xprt->timeout->to_retries + 1;
+ timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
+ (xprt->timeout->to_retries + 1);
+ clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state);
+ spin_unlock_bh(&xprt->transport_lock);
+
+ /* TCP Keepalive options */
+ kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+ (char *)&opt_on, sizeof(opt_on));
+ kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
+ (char *)&keepidle, sizeof(keepidle));
+ kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
+ (char *)&keepidle, sizeof(keepidle));
+ kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
+ (char *)&keepcnt, sizeof(keepcnt));
+
+ /* TCP user timeout (see RFC5482) */
+ kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
+ (char *)&timeo, sizeof(timeo));
+}
+
+static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt,
+ unsigned long connect_timeout,
+ unsigned long reconnect_timeout)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ struct rpc_timeout to;
+ unsigned long initval;
+
+ spin_lock_bh(&xprt->transport_lock);
+ if (reconnect_timeout < xprt->max_reconnect_timeout)
+ xprt->max_reconnect_timeout = reconnect_timeout;
+ if (connect_timeout < xprt->connect_timeout) {
+ memcpy(&to, xprt->timeout, sizeof(to));
+ initval = DIV_ROUND_UP(connect_timeout, to.to_retries + 1);
+ /* Arbitrary lower limit */
+ if (initval < XS_TCP_INIT_REEST_TO << 1)
+ initval = XS_TCP_INIT_REEST_TO << 1;
+ to.to_initval = initval;
+ to.to_maxval = initval;
+ memcpy(&transport->tcp_timeout, &to,
+ sizeof(transport->tcp_timeout));
+ xprt->timeout = &transport->tcp_timeout;
+ xprt->connect_timeout = connect_timeout;
+ }
+ set_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state);
+ spin_unlock_bh(&xprt->transport_lock);
+}
+
static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
{
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -2242,22 +2309,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
if (!transport->inet) {
struct sock *sk = sock->sk;
- unsigned int keepidle = xprt->timeout->to_initval / HZ;
- unsigned int keepcnt = xprt->timeout->to_retries + 1;
- unsigned int opt_on = 1;
- unsigned int timeo;
unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC;
- /* TCP Keepalive options */
- kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
- (char *)&opt_on, sizeof(opt_on));
- kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
- (char *)&keepidle, sizeof(keepidle));
- kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
- (char *)&keepidle, sizeof(keepidle));
- kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
- (char *)&keepcnt, sizeof(keepcnt));
-
/* Avoid temporary address, they are bad for long-lived
* connections such as NFS mounts.
* RFC4941, section 3.6 suggests that:
@@ -2268,11 +2321,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES,
(char *)&addr_pref, sizeof(addr_pref));
- /* TCP user timeout (see RFC5482) */
- timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
- (xprt->timeout->to_retries + 1);
- kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
- (char *)&timeo, sizeof(timeo));
+ xs_tcp_set_socket_timeouts(xprt, sock);
write_lock_bh(&sk->sk_callback_lock);
@@ -2721,6 +2770,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
.set_retrans_timeout = xprt_set_retrans_timeout_def,
.close = xs_tcp_shutdown,
.destroy = xs_destroy,
+ .set_connect_timeout = xs_tcp_set_connect_timeout,
.print_stats = xs_tcp_print_stats,
.enable_swap = xs_enable_swap,
.disable_swap = xs_disable_swap,
@@ -3007,6 +3057,8 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
xprt->timeout = &xs_tcp_default_timeout;
xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
+ xprt->connect_timeout = xprt->timeout->to_initval *
+ (xprt->timeout->to_retries + 1);
INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn);
INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
@@ -3209,7 +3261,9 @@ static int param_set_uint_minmax(const char *val,
if (!val)
return -EINVAL;
ret = kstrtouint(val, 0, &num);
- if (ret == -EINVAL || num < min || num > max)
+ if (ret)
+ return ret;
+ if (num < min || num > max)
return -EINVAL;
*((unsigned int *)kp->arg) = num;
return 0;
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 3b95fe980fa2..017801f9dbaa 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -624,13 +624,10 @@ EXPORT_SYMBOL_GPL(unregister_switchdev_notifier);
int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
struct switchdev_notifier_info *info)
{
- int err;
-
ASSERT_RTNL();
info->dev = dev;
- err = raw_notifier_call_chain(&switchdev_notif_chain, val, info);
- return err;
+ return raw_notifier_call_chain(&switchdev_notif_chain, val, info);
}
EXPORT_SYMBOL_GPL(call_switchdev_notifiers);
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index aa1babbea385..7d99029df342 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -1,7 +1,7 @@
/*
* net/tipc/bcast.c: TIPC broadcast code
*
- * Copyright (c) 2004-2006, 2014-2015, Ericsson AB
+ * Copyright (c) 2004-2006, 2014-2016, Ericsson AB
* Copyright (c) 2004, Intel Corporation.
* Copyright (c) 2005, 2010-2011, Wind River Systems
* All rights reserved.
@@ -39,9 +39,8 @@
#include "socket.h"
#include "msg.h"
#include "bcast.h"
-#include "name_distr.h"
#include "link.h"
-#include "node.h"
+#include "name_table.h"
#define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */
#define BCLINK_WIN_MIN 32 /* bcast minimum link window size */
@@ -54,12 +53,20 @@ const char tipc_bclink_name[] = "broadcast-link";
* @inputq: data input queue; will only carry SOCK_WAKEUP messages
* @dest: array keeping number of reachable destinations per bearer
* @primary_bearer: a bearer having links to all broadcast destinations, if any
+ * @bcast_support: indicates if primary bearer, if any, supports broadcast
+ * @rcast_support: indicates if all peer nodes support replicast
+ * @rc_ratio: dest count as percentage of cluster size where send method changes
+ * @bc_threshold: calculated drom rc_ratio; if dests > threshold use broadcast
*/
struct tipc_bc_base {
struct tipc_link *link;
struct sk_buff_head inputq;
int dests[MAX_BEARERS];
int primary_bearer;
+ bool bcast_support;
+ bool rcast_support;
+ int rc_ratio;
+ int bc_threshold;
};
static struct tipc_bc_base *tipc_bc_base(struct net *net)
@@ -69,7 +76,20 @@ static struct tipc_bc_base *tipc_bc_base(struct net *net)
int tipc_bcast_get_mtu(struct net *net)
{
- return tipc_link_mtu(tipc_bc_sndlink(net));
+ return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE;
+}
+
+void tipc_bcast_disable_rcast(struct net *net)
+{
+ tipc_bc_base(net)->rcast_support = false;
+}
+
+static void tipc_bcbase_calc_bc_threshold(struct net *net)
+{
+ struct tipc_bc_base *bb = tipc_bc_base(net);
+ int cluster_size = tipc_link_bc_peers(tipc_bc_sndlink(net));
+
+ bb->bc_threshold = 1 + (cluster_size * bb->rc_ratio / 100);
}
/* tipc_bcbase_select_primary(): find a bearer with links to all destinations,
@@ -79,9 +99,10 @@ static void tipc_bcbase_select_primary(struct net *net)
{
struct tipc_bc_base *bb = tipc_bc_base(net);
int all_dests = tipc_link_bc_peers(bb->link);
- int i, mtu;
+ int i, mtu, prim;
bb->primary_bearer = INVALID_BEARER_ID;
+ bb->bcast_support = true;
if (!all_dests)
return;
@@ -93,7 +114,7 @@ static void tipc_bcbase_select_primary(struct net *net)
mtu = tipc_bearer_mtu(net, i);
if (mtu < tipc_link_mtu(bb->link))
tipc_link_set_mtu(bb->link, mtu);
-
+ bb->bcast_support &= tipc_bearer_bcast_support(net, i);
if (bb->dests[i] < all_dests)
continue;
@@ -103,6 +124,9 @@ static void tipc_bcbase_select_primary(struct net *net)
if ((i ^ tipc_own_addr(net)) & 1)
break;
}
+ prim = bb->primary_bearer;
+ if (prim != INVALID_BEARER_ID)
+ bb->bcast_support = tipc_bearer_bcast_support(net, prim);
}
void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id)
@@ -170,45 +194,131 @@ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq)
__skb_queue_purge(&_xmitq);
}
-/* tipc_bcast_xmit - deliver buffer chain to all nodes in cluster
- * and to identified node local sockets
+static void tipc_bcast_select_xmit_method(struct net *net, int dests,
+ struct tipc_mc_method *method)
+{
+ struct tipc_bc_base *bb = tipc_bc_base(net);
+ unsigned long exp = method->expires;
+
+ /* Broadcast supported by used bearer/bearers? */
+ if (!bb->bcast_support) {
+ method->rcast = true;
+ return;
+ }
+ /* Any destinations which don't support replicast ? */
+ if (!bb->rcast_support) {
+ method->rcast = false;
+ return;
+ }
+ /* Can current method be changed ? */
+ method->expires = jiffies + TIPC_METHOD_EXPIRE;
+ if (method->mandatory || time_before(jiffies, exp))
+ return;
+
+ /* Determine method to use now */
+ method->rcast = dests <= bb->bc_threshold;
+}
+
+/* tipc_bcast_xmit - broadcast the buffer chain to all external nodes
* @net: the applicable net namespace
- * @list: chain of buffers containing message
- * Consumes the buffer chain, except when returning -ELINKCONG
- * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
+ * @pkts: chain of buffers containing message
+ * @cong_link_cnt: set to 1 if broadcast link is congested, otherwise 0
+ * Consumes the buffer chain.
+ * Returns 0 if success, otherwise errno: -EHOSTUNREACH,-EMSGSIZE
*/
-int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
+static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts,
+ u16 *cong_link_cnt)
{
struct tipc_link *l = tipc_bc_sndlink(net);
- struct sk_buff_head xmitq, inputq, rcvq;
+ struct sk_buff_head xmitq;
int rc = 0;
- __skb_queue_head_init(&rcvq);
__skb_queue_head_init(&xmitq);
- skb_queue_head_init(&inputq);
-
- /* Prepare message clone for local node */
- if (unlikely(!tipc_msg_reassemble(list, &rcvq)))
- return -EHOSTUNREACH;
-
tipc_bcast_lock(net);
if (tipc_link_bc_peers(l))
- rc = tipc_link_xmit(l, list, &xmitq);
+ rc = tipc_link_xmit(l, pkts, &xmitq);
tipc_bcast_unlock(net);
-
- /* Don't send to local node if adding to link failed */
- if (unlikely(rc)) {
- __skb_queue_purge(&rcvq);
- return rc;
+ tipc_bcbase_xmit(net, &xmitq);
+ __skb_queue_purge(pkts);
+ if (rc == -ELINKCONG) {
+ *cong_link_cnt = 1;
+ rc = 0;
}
+ return rc;
+}
- /* Broadcast to all nodes, inluding local node */
- tipc_bcbase_xmit(net, &xmitq);
- tipc_sk_mcast_rcv(net, &rcvq, &inputq);
- __skb_queue_purge(list);
+/* tipc_rcast_xmit - replicate and send a message to given destination nodes
+ * @net: the applicable net namespace
+ * @pkts: chain of buffers containing message
+ * @dests: list of destination nodes
+ * @cong_link_cnt: returns number of congested links
+ * @cong_links: returns identities of congested links
+ * Returns 0 if success, otherwise errno
+ */
+static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts,
+ struct tipc_nlist *dests, u16 *cong_link_cnt)
+{
+ struct sk_buff_head _pkts;
+ struct u32_item *n, *tmp;
+ u32 dst, selector;
+
+ selector = msg_link_selector(buf_msg(skb_peek(pkts)));
+ __skb_queue_head_init(&_pkts);
+
+ list_for_each_entry_safe(n, tmp, &dests->list, list) {
+ dst = n->value;
+ if (!tipc_msg_pskb_copy(dst, pkts, &_pkts))
+ return -ENOMEM;
+
+ /* Any other return value than -ELINKCONG is ignored */
+ if (tipc_node_xmit(net, &_pkts, dst, selector) == -ELINKCONG)
+ (*cong_link_cnt)++;
+ }
return 0;
}
+/* tipc_mcast_xmit - deliver message to indicated destination nodes
+ * and to identified node local sockets
+ * @net: the applicable net namespace
+ * @pkts: chain of buffers containing message
+ * @method: send method to be used
+ * @dests: destination nodes for message.
+ * @cong_link_cnt: returns number of encountered congested destination links
+ * Consumes buffer chain.
+ * Returns 0 if success, otherwise errno
+ */
+int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
+ struct tipc_mc_method *method, struct tipc_nlist *dests,
+ u16 *cong_link_cnt)
+{
+ struct sk_buff_head inputq, localq;
+ int rc = 0;
+
+ skb_queue_head_init(&inputq);
+ skb_queue_head_init(&localq);
+
+ /* Clone packets before they are consumed by next call */
+ if (dests->local && !tipc_msg_reassemble(pkts, &localq)) {
+ rc = -ENOMEM;
+ goto exit;
+ }
+ /* Send according to determined transmit method */
+ if (dests->remote) {
+ tipc_bcast_select_xmit_method(net, dests->remote, method);
+ if (method->rcast)
+ rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt);
+ else
+ rc = tipc_bcast_xmit(net, pkts, cong_link_cnt);
+ }
+
+ if (dests->local)
+ tipc_sk_mcast_rcv(net, &localq, &inputq);
+exit:
+ /* This queue should normally be empty by now */
+ __skb_queue_purge(pkts);
+ return rc;
+}
+
/* tipc_bcast_rcv - receive a broadcast packet, and deliver to rcv link
*
* RCU is locked, no other locks set
@@ -313,6 +423,7 @@ void tipc_bcast_add_peer(struct net *net, struct tipc_link *uc_l,
tipc_bcast_lock(net);
tipc_link_add_bc_peer(snd_l, uc_l, xmitq);
tipc_bcbase_select_primary(net);
+ tipc_bcbase_calc_bc_threshold(net);
tipc_bcast_unlock(net);
}
@@ -331,6 +442,7 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l)
tipc_bcast_lock(net);
tipc_link_remove_bc_peer(snd_l, rcv_l, &xmitq);
tipc_bcbase_select_primary(net);
+ tipc_bcbase_calc_bc_threshold(net);
tipc_bcast_unlock(net);
tipc_bcbase_xmit(net, &xmitq);
@@ -413,6 +525,8 @@ int tipc_bcast_init(struct net *net)
goto enomem;
bb->link = l;
tn->bcl = l;
+ bb->rc_ratio = 25;
+ bb->rcast_support = true;
return 0;
enomem:
kfree(bb);
@@ -428,3 +542,33 @@ void tipc_bcast_stop(struct net *net)
kfree(tn->bcbase);
kfree(tn->bcl);
}
+
+void tipc_nlist_init(struct tipc_nlist *nl, u32 self)
+{
+ memset(nl, 0, sizeof(*nl));
+ INIT_LIST_HEAD(&nl->list);
+ nl->self = self;
+}
+
+void tipc_nlist_add(struct tipc_nlist *nl, u32 node)
+{
+ if (node == nl->self)
+ nl->local = true;
+ else if (u32_push(&nl->list, node))
+ nl->remote++;
+}
+
+void tipc_nlist_del(struct tipc_nlist *nl, u32 node)
+{
+ if (node == nl->self)
+ nl->local = false;
+ else if (u32_del(&nl->list, node))
+ nl->remote--;
+}
+
+void tipc_nlist_purge(struct tipc_nlist *nl)
+{
+ u32_list_purge(&nl->list);
+ nl->remote = 0;
+ nl->local = 0;
+}
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index 855d53c64ab3..751530ab0c49 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -42,9 +42,35 @@
struct tipc_node;
struct tipc_msg;
struct tipc_nl_msg;
-struct tipc_node_map;
+struct tipc_nlist;
+struct tipc_nitem;
extern const char tipc_bclink_name[];
+#define TIPC_METHOD_EXPIRE msecs_to_jiffies(5000)
+
+struct tipc_nlist {
+ struct list_head list;
+ u32 self;
+ u16 remote;
+ bool local;
+};
+
+void tipc_nlist_init(struct tipc_nlist *nl, u32 self);
+void tipc_nlist_purge(struct tipc_nlist *nl);
+void tipc_nlist_add(struct tipc_nlist *nl, u32 node);
+void tipc_nlist_del(struct tipc_nlist *nl, u32 node);
+
+/* Cookie to be used between socket and broadcast layer
+ * @rcast: replicast (instead of broadcast) was used at previous xmit
+ * @mandatory: broadcast/replicast indication was set by user
+ * @expires: re-evaluate non-mandatory transmit method if we are past this
+ */
+struct tipc_mc_method {
+ bool rcast;
+ bool mandatory;
+ unsigned long expires;
+};
+
int tipc_bcast_init(struct net *net);
void tipc_bcast_stop(struct net *net);
void tipc_bcast_add_peer(struct net *net, struct tipc_link *l,
@@ -53,7 +79,10 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl);
void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id);
void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id);
int tipc_bcast_get_mtu(struct net *net);
-int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list);
+void tipc_bcast_disable_rcast(struct net *net);
+int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
+ struct tipc_mc_method *method, struct tipc_nlist *dests,
+ u16 *cong_link_cnt);
int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb);
void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l,
struct tipc_msg *hdr);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 52d74760fb68..33a5bdfbef76 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -431,7 +431,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
memset(&b->bcast_addr, 0, sizeof(b->bcast_addr));
memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len);
b->bcast_addr.media_id = b->media->type_id;
- b->bcast_addr.broadcast = 1;
+ b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT;
b->mtu = dev->mtu;
b->media->raw2addr(b, &b->addr, (char *)dev->dev_addr);
rcu_assign_pointer(dev->tipc_ptr, b);
@@ -482,6 +482,19 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
return 0;
}
+bool tipc_bearer_bcast_support(struct net *net, u32 bearer_id)
+{
+ bool supp = false;
+ struct tipc_bearer *b;
+
+ rcu_read_lock();
+ b = bearer_get(net, bearer_id);
+ if (b)
+ supp = (b->bcast_addr.broadcast == TIPC_BROADCAST_SUPPORT);
+ rcu_read_unlock();
+ return supp;
+}
+
int tipc_bearer_mtu(struct net *net, u32 bearer_id)
{
int mtu = 0;
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 278ff7f616f9..635c9086e19a 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -60,9 +60,14 @@
#define TIPC_MEDIA_TYPE_IB 2
#define TIPC_MEDIA_TYPE_UDP 3
-/* minimum bearer MTU */
+/* Minimum bearer MTU */
#define TIPC_MIN_BEARER_MTU (MAX_H_SIZE + INT_H_SIZE)
+/* Identifiers for distinguishing between broadcast/multicast and replicast
+ */
+#define TIPC_BROADCAST_SUPPORT 1
+#define TIPC_REPLICAST_SUPPORT 2
+
/**
* struct tipc_media_addr - destination address used by TIPC bearers
* @value: address info (format defined by media)
@@ -210,6 +215,7 @@ int tipc_bearer_setup(void);
void tipc_bearer_cleanup(void);
void tipc_bearer_stop(struct net *net);
int tipc_bearer_mtu(struct net *net, u32 bearer_id);
+bool tipc_bearer_bcast_support(struct net *net, u32 bearer_id);
void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id,
struct sk_buff *skb,
struct tipc_media_addr *dest);
diff --git a/net/tipc/core.c b/net/tipc/core.c
index 236b043a4156..0b982d048fb9 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -47,7 +47,7 @@
#include <linux/module.h>
/* configurable TIPC parameters */
-int tipc_net_id __read_mostly;
+unsigned int tipc_net_id __read_mostly;
int sysctl_tipc_rmem[3] __read_mostly; /* min/default/max */
static int __net_init tipc_init_net(struct net *net)
diff --git a/net/tipc/core.h b/net/tipc/core.h
index a1845fb27d80..5cc5398be722 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -74,7 +74,7 @@ struct tipc_monitor;
#define MAX_BEARERS 3
#define TIPC_DEF_MON_THRESHOLD 32
-extern int tipc_net_id __read_mostly;
+extern unsigned int tipc_net_id __read_mostly;
extern int sysctl_tipc_rmem[3] __read_mostly;
extern int sysctl_tipc_named_timeout __read_mostly;
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index 6b109a808d4c..02462d67d191 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -169,7 +169,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
/* Send response, if necessary */
if (respond && (mtyp == DSC_REQ_MSG)) {
- rskb = tipc_buf_acquire(MAX_H_SIZE);
+ rskb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC);
if (!rskb)
return;
tipc_disc_init_msg(net, rskb, DSC_RESP_MSG, bearer);
@@ -278,7 +278,7 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b,
req = kmalloc(sizeof(*req), GFP_ATOMIC);
if (!req)
return -ENOMEM;
- req->buf = tipc_buf_acquire(MAX_H_SIZE);
+ req->buf = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC);
if (!req->buf) {
kfree(req);
return -ENOMEM;
diff --git a/net/tipc/link.c b/net/tipc/link.c
index bda89bf9f4ff..ddd2dd6f77aa 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -515,6 +515,10 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
if (link_is_bc_sndlink(l))
l->state = LINK_ESTABLISHED;
+ /* Disable replicast if even a single peer doesn't support it */
+ if (link_is_bc_rcvlink(l) && !(peer_caps & TIPC_BCAST_RCAST))
+ tipc_bcast_disable_rcast(net);
+
return true;
}
@@ -776,60 +780,47 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
/**
* link_schedule_user - schedule a message sender for wakeup after congestion
- * @link: congested link
- * @list: message that was attempted sent
+ * @l: congested link
+ * @hdr: header of message that is being sent
* Create pseudo msg to send back to user when congestion abates
- * Does not consume buffer list
*/
-static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
+static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr)
{
- struct tipc_msg *msg = buf_msg(skb_peek(list));
- int imp = msg_importance(msg);
- u32 oport = msg_origport(msg);
- u32 addr = tipc_own_addr(link->net);
+ u32 dnode = tipc_own_addr(l->net);
+ u32 dport = msg_origport(hdr);
struct sk_buff *skb;
- /* This really cannot happen... */
- if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) {
- pr_warn("%s<%s>, send queue full", link_rst_msg, link->name);
- return -ENOBUFS;
- }
- /* Non-blocking sender: */
- if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending)
- return -ELINKCONG;
-
/* Create and schedule wakeup pseudo message */
skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0,
- addr, addr, oport, 0, 0);
+ dnode, l->addr, dport, 0, 0);
if (!skb)
return -ENOBUFS;
- TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list);
- TIPC_SKB_CB(skb)->chain_imp = imp;
- skb_queue_tail(&link->wakeupq, skb);
- link->stats.link_congs++;
+ msg_set_dest_droppable(buf_msg(skb), true);
+ TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr);
+ skb_queue_tail(&l->wakeupq, skb);
+ l->stats.link_congs++;
return -ELINKCONG;
}
/**
* link_prepare_wakeup - prepare users for wakeup after congestion
- * @link: congested link
- * Move a number of waiting users, as permitted by available space in
- * the send queue, from link wait queue to node wait queue for wakeup
+ * @l: congested link
+ * Wake up a number of waiting users, as permitted by available space
+ * in the send queue
*/
void link_prepare_wakeup(struct tipc_link *l)
{
- int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,};
- int imp, lim;
struct sk_buff *skb, *tmp;
+ int imp, i = 0;
skb_queue_walk_safe(&l->wakeupq, skb, tmp) {
imp = TIPC_SKB_CB(skb)->chain_imp;
- lim = l->backlog[imp].limit;
- pnd[imp] += TIPC_SKB_CB(skb)->chain_sz;
- if ((pnd[imp] + l->backlog[imp].len) >= lim)
+ if (l->backlog[imp].len < l->backlog[imp].limit) {
+ skb_unlink(skb, &l->wakeupq);
+ skb_queue_tail(l->inputq, skb);
+ } else if (i++ > 10) {
break;
- skb_unlink(skb, &l->wakeupq);
- skb_queue_tail(l->inputq, skb);
+ }
}
}
@@ -869,8 +860,7 @@ void tipc_link_reset(struct tipc_link *l)
* @list: chain of buffers containing message
* @xmitq: returned list of packets to be sent by caller
*
- * Consumes the buffer chain, except when returning -ELINKCONG,
- * since the caller then may want to make more send attempts.
+ * Consumes the buffer chain.
* Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS
* Messages at TIPC_SYSTEM_IMPORTANCE are always accepted
*/
@@ -879,7 +869,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
{
struct tipc_msg *hdr = buf_msg(skb_peek(list));
unsigned int maxwin = l->window;
- unsigned int i, imp = msg_importance(hdr);
+ int imp = msg_importance(hdr);
unsigned int mtu = l->mtu;
u16 ack = l->rcv_nxt - 1;
u16 seqno = l->snd_nxt;
@@ -888,19 +878,22 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
struct sk_buff_head *backlogq = &l->backlogq;
struct sk_buff *skb, *_skb, *bskb;
int pkt_cnt = skb_queue_len(list);
+ int rc = 0;
- /* Match msg importance against this and all higher backlog limits: */
- if (!skb_queue_empty(backlogq)) {
- for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) {
- if (unlikely(l->backlog[i].len >= l->backlog[i].limit))
- return link_schedule_user(l, list);
- }
- }
if (unlikely(msg_size(hdr) > mtu)) {
skb_queue_purge(list);
return -EMSGSIZE;
}
+ /* Allow oversubscription of one data msg per source at congestion */
+ if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) {
+ if (imp == TIPC_SYSTEM_IMPORTANCE) {
+ pr_warn("%s<%s>, link overflow", link_rst_msg, l->name);
+ return -ENOBUFS;
+ }
+ rc = link_schedule_user(l, hdr);
+ }
+
if (pkt_cnt > 1) {
l->stats.sent_fragmented++;
l->stats.sent_fragments += pkt_cnt;
@@ -946,7 +939,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
skb_queue_splice_tail_init(list, backlogq);
}
l->snd_nxt = seqno;
- return 0;
+ return rc;
}
void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq)
@@ -1043,11 +1036,17 @@ int tipc_link_retrans(struct tipc_link *l, u16 from, u16 to,
static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb,
struct sk_buff_head *inputq)
{
- switch (msg_user(buf_msg(skb))) {
+ struct tipc_msg *hdr = buf_msg(skb);
+
+ switch (msg_user(hdr)) {
case TIPC_LOW_IMPORTANCE:
case TIPC_MEDIUM_IMPORTANCE:
case TIPC_HIGH_IMPORTANCE:
case TIPC_CRITICAL_IMPORTANCE:
+ if (unlikely(msg_type(hdr) == TIPC_MCAST_MSG)) {
+ skb_queue_tail(l->bc_rcvlink->inputq, skb);
+ return true;
+ }
case CONN_MANAGER:
skb_queue_tail(inputq, skb);
return true;
@@ -1395,7 +1394,7 @@ tnl:
msg_set_seqno(hdr, seqno++);
pktlen = msg_size(hdr);
msg_set_size(&tnlhdr, pktlen + INT_H_SIZE);
- tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE);
+ tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE, GFP_ATOMIC);
if (!tnlskb) {
pr_warn("%sunable to send packet\n", link_co_err);
return;
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 17201aa8423d..312ef7de57d7 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -58,12 +58,12 @@ static unsigned int align(unsigned int i)
* NOTE: Headroom is reserved to allow prepending of a data link header.
* There may also be unrequested tailroom present at the buffer's end.
*/
-struct sk_buff *tipc_buf_acquire(u32 size)
+struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp)
{
struct sk_buff *skb;
unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u;
- skb = alloc_skb_fclone(buf_size, GFP_ATOMIC);
+ skb = alloc_skb_fclone(buf_size, gfp);
if (skb) {
skb_reserve(skb, BUF_HEADROOM);
skb_put(skb, size);
@@ -95,7 +95,7 @@ struct sk_buff *tipc_msg_create(uint user, uint type,
struct tipc_msg *msg;
struct sk_buff *buf;
- buf = tipc_buf_acquire(hdr_sz + data_sz);
+ buf = tipc_buf_acquire(hdr_sz + data_sz, GFP_ATOMIC);
if (unlikely(!buf))
return NULL;
@@ -261,14 +261,14 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
/* No fragmentation needed? */
if (likely(msz <= pktmax)) {
- skb = tipc_buf_acquire(msz);
+ skb = tipc_buf_acquire(msz, GFP_KERNEL);
if (unlikely(!skb))
return -ENOMEM;
skb_orphan(skb);
__skb_queue_tail(list, skb);
skb_copy_to_linear_data(skb, mhdr, mhsz);
pktpos = skb->data + mhsz;
- if (copy_from_iter(pktpos, dsz, &m->msg_iter) == dsz)
+ if (copy_from_iter_full(pktpos, dsz, &m->msg_iter))
return dsz;
rc = -EFAULT;
goto error;
@@ -282,7 +282,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
msg_set_importance(&pkthdr, msg_importance(mhdr));
/* Prepare first fragment */
- skb = tipc_buf_acquire(pktmax);
+ skb = tipc_buf_acquire(pktmax, GFP_KERNEL);
if (!skb)
return -ENOMEM;
skb_orphan(skb);
@@ -299,7 +299,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
if (drem < pktrem)
pktrem = drem;
- if (copy_from_iter(pktpos, pktrem, &m->msg_iter) != pktrem) {
+ if (!copy_from_iter_full(pktpos, pktrem, &m->msg_iter)) {
rc = -EFAULT;
goto error;
}
@@ -313,7 +313,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
pktsz = drem + INT_H_SIZE;
else
pktsz = pktmax;
- skb = tipc_buf_acquire(pktsz);
+ skb = tipc_buf_acquire(pktsz, GFP_KERNEL);
if (!skb) {
rc = -ENOMEM;
goto error;
@@ -448,7 +448,7 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg,
if (msz > (max / 2))
return false;
- _skb = tipc_buf_acquire(max);
+ _skb = tipc_buf_acquire(max, GFP_ATOMIC);
if (!_skb)
return false;
@@ -496,7 +496,7 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err)
/* Never return SHORT header; expand by replacing buffer if necessary */
if (msg_short(hdr)) {
- *skb = tipc_buf_acquire(BASIC_H_SIZE + dlen);
+ *skb = tipc_buf_acquire(BASIC_H_SIZE + dlen, GFP_ATOMIC);
if (!*skb)
goto exit;
memcpy((*skb)->data + BASIC_H_SIZE, msg_data(hdr), dlen);
@@ -607,6 +607,23 @@ error:
return false;
}
+bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
+ struct sk_buff_head *cpy)
+{
+ struct sk_buff *skb, *_skb;
+
+ skb_queue_walk(msg, skb) {
+ _skb = pskb_copy(skb, GFP_ATOMIC);
+ if (!_skb) {
+ __skb_queue_purge(cpy);
+ return false;
+ }
+ msg_set_destnode(buf_msg(_skb), dst);
+ __skb_queue_tail(cpy, _skb);
+ }
+ return true;
+}
+
/* tipc_skb_queue_sorted(); sort pkt into list according to sequence number
* @list: list to be appended to
* @seqno: sequence number of buffer to add
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 50a739860d37..c843fd2bc48d 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -95,11 +95,9 @@ struct plist;
#define TIPC_MEDIA_INFO_OFFSET 5
struct tipc_skb_cb {
- void *handle;
+ u32 bytes_read;
struct sk_buff *tail;
bool validated;
- bool wakeup_pending;
- u16 chain_sz;
u16 chain_imp;
u16 ackers;
};
@@ -633,14 +631,11 @@ static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id)
static inline u32 msg_link_selector(struct tipc_msg *m)
{
+ if (msg_user(m) == MSG_FRAGMENTER)
+ m = (void *)msg_data(m);
return msg_bits(m, 4, 0, 1);
}
-static inline void msg_set_link_selector(struct tipc_msg *m, u32 n)
-{
- msg_set_bits(m, 4, 0, 1, n);
-}
-
/*
* Word 5
*/
@@ -820,7 +815,7 @@ static inline bool msg_is_reset(struct tipc_msg *hdr)
return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG);
}
-struct sk_buff *tipc_buf_acquire(u32 size);
+struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp);
bool tipc_msg_validate(struct sk_buff *skb);
bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err);
void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type,
@@ -837,6 +832,8 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
int offset, int dsz, int mtu, struct sk_buff_head *list);
bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err);
bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq);
+bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
+ struct sk_buff_head *cpy);
void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno,
struct sk_buff *skb);
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index c1cfd92de17a..23f8899e0f8c 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -69,7 +69,7 @@ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size,
u32 dest)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size);
+ struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC);
struct tipc_msg *msg;
if (buf != NULL) {
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index e190460fe0d3..9be6592e4a6f 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -608,7 +608,7 @@ not_found:
* Returns non-zero if any off-node ports overlap
*/
int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
- u32 limit, struct tipc_plist *dports)
+ u32 limit, struct list_head *dports)
{
struct name_seq *seq;
struct sub_seq *sseq;
@@ -633,7 +633,7 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
info = sseq->info;
list_for_each_entry(publ, &info->node_list, node_list) {
if (publ->scope <= limit)
- tipc_plist_push(dports, publ->ref);
+ u32_push(dports, publ->ref);
}
if (info->cluster_list_size != info->node_list_size)
@@ -645,6 +645,39 @@ exit:
return res;
}
+/* tipc_nametbl_lookup_dst_nodes - find broadcast destination nodes
+ * - Creates list of nodes that overlap the given multicast address
+ * - Determines if any node local ports overlap
+ */
+void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
+ u32 upper, u32 domain,
+ struct tipc_nlist *nodes)
+{
+ struct sub_seq *sseq, *stop;
+ struct publication *publ;
+ struct name_info *info;
+ struct name_seq *seq;
+
+ rcu_read_lock();
+ seq = nametbl_find_seq(net, type);
+ if (!seq)
+ goto exit;
+
+ spin_lock_bh(&seq->lock);
+ sseq = seq->sseqs + nameseq_locate_subseq(seq, lower);
+ stop = seq->sseqs + seq->first_free;
+ for (; sseq->lower <= upper && sseq != stop; sseq++) {
+ info = sseq->info;
+ list_for_each_entry(publ, &info->zone_list, zone_list) {
+ if (tipc_in_scope(domain, publ->node))
+ tipc_nlist_add(nodes, publ->node);
+ }
+ }
+ spin_unlock_bh(&seq->lock);
+exit:
+ rcu_read_unlock();
+}
+
/*
* tipc_nametbl_publish - add name publication to network name tables
*/
@@ -1022,40 +1055,79 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-void tipc_plist_push(struct tipc_plist *pl, u32 port)
+bool u32_find(struct list_head *l, u32 value)
{
- struct tipc_plist *nl;
+ struct u32_item *item;
- if (likely(!pl->port)) {
- pl->port = port;
- return;
+ list_for_each_entry(item, l, list) {
+ if (item->value == value)
+ return true;
}
- if (pl->port == port)
- return;
- list_for_each_entry(nl, &pl->list, list) {
- if (nl->port == port)
- return;
+ return false;
+}
+
+bool u32_push(struct list_head *l, u32 value)
+{
+ struct u32_item *item;
+
+ list_for_each_entry(item, l, list) {
+ if (item->value == value)
+ return false;
+ }
+ item = kmalloc(sizeof(*item), GFP_ATOMIC);
+ if (unlikely(!item))
+ return false;
+
+ item->value = value;
+ list_add(&item->list, l);
+ return true;
+}
+
+u32 u32_pop(struct list_head *l)
+{
+ struct u32_item *item;
+ u32 value = 0;
+
+ if (list_empty(l))
+ return 0;
+ item = list_first_entry(l, typeof(*item), list);
+ value = item->value;
+ list_del(&item->list);
+ kfree(item);
+ return value;
+}
+
+bool u32_del(struct list_head *l, u32 value)
+{
+ struct u32_item *item, *tmp;
+
+ list_for_each_entry_safe(item, tmp, l, list) {
+ if (item->value != value)
+ continue;
+ list_del(&item->list);
+ kfree(item);
+ return true;
}
- nl = kmalloc(sizeof(*nl), GFP_ATOMIC);
- if (nl) {
- nl->port = port;
- list_add(&nl->list, &pl->list);
+ return false;
+}
+
+void u32_list_purge(struct list_head *l)
+{
+ struct u32_item *item, *tmp;
+
+ list_for_each_entry_safe(item, tmp, l, list) {
+ list_del(&item->list);
+ kfree(item);
}
}
-u32 tipc_plist_pop(struct tipc_plist *pl)
+int u32_list_len(struct list_head *l)
{
- struct tipc_plist *nl;
- u32 port = 0;
+ struct u32_item *item;
+ int i = 0;
- if (likely(list_empty(&pl->list))) {
- port = pl->port;
- pl->port = 0;
- return port;
+ list_for_each_entry(item, l, list) {
+ i++;
}
- nl = list_first_entry(&pl->list, typeof(*nl), list);
- port = nl->port;
- list_del(&nl->list);
- kfree(nl);
- return port;
+ return i;
}
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 1524a73830f7..6ebdeb1d84a5 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -39,6 +39,7 @@
struct tipc_subscription;
struct tipc_plist;
+struct tipc_nlist;
/*
* TIPC name types reserved for internal TIPC use (both current and planned)
@@ -99,7 +100,10 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb);
u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node);
int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
- u32 limit, struct tipc_plist *dports);
+ u32 limit, struct list_head *dports);
+void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
+ u32 upper, u32 domain,
+ struct tipc_nlist *nodes);
struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
u32 upper, u32 scope, u32 port_ref,
u32 key);
@@ -116,18 +120,16 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s);
int tipc_nametbl_init(struct net *net);
void tipc_nametbl_stop(struct net *net);
-struct tipc_plist {
+struct u32_item {
struct list_head list;
- u32 port;
+ u32 value;
};
-static inline void tipc_plist_init(struct tipc_plist *pl)
-{
- INIT_LIST_HEAD(&pl->list);
- pl->port = 0;
-}
-
-void tipc_plist_push(struct tipc_plist *pl, u32 port);
-u32 tipc_plist_pop(struct tipc_plist *pl);
+bool u32_push(struct list_head *l, u32 value);
+u32 u32_pop(struct list_head *l);
+bool u32_find(struct list_head *l, u32 value);
+bool u32_del(struct list_head *l, u32 value);
+void u32_list_purge(struct list_head *l);
+int u32_list_len(struct list_head *l);
#endif
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 28bf4feeb81c..ab8a2d5d1e32 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -110,6 +110,10 @@ int tipc_net_start(struct net *net, u32 addr)
char addr_string[16];
tn->own_addr = addr;
+
+ /* Ensure that the new address is visible before we reinit. */
+ smp_mb();
+
tipc_named_reinit(net);
tipc_sk_reinit(net);
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 3200059d14b2..26ca8dd64ded 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -135,15 +135,6 @@ const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = {
/* Users of the legacy API (tipc-config) can't handle that we add operations,
* so we have a separate genl handling for the new API.
*/
-struct genl_family tipc_genl_family = {
- .id = GENL_ID_GENERATE,
- .name = TIPC_GENL_V2_NAME,
- .version = TIPC_GENL_V2_VERSION,
- .hdrsize = 0,
- .maxattr = TIPC_NLA_MAX,
- .netnsok = true,
-};
-
static const struct genl_ops tipc_genl_v2_ops[] = {
{
.cmd = TIPC_NL_BEARER_DISABLE,
@@ -258,23 +249,33 @@ static const struct genl_ops tipc_genl_v2_ops[] = {
#endif
};
+struct genl_family tipc_genl_family __ro_after_init = {
+ .name = TIPC_GENL_V2_NAME,
+ .version = TIPC_GENL_V2_VERSION,
+ .hdrsize = 0,
+ .maxattr = TIPC_NLA_MAX,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = tipc_genl_v2_ops,
+ .n_ops = ARRAY_SIZE(tipc_genl_v2_ops),
+};
+
int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr)
{
u32 maxattr = tipc_genl_family.maxattr;
- *attr = tipc_genl_family.attrbuf;
+ *attr = genl_family_attrbuf(&tipc_genl_family);
if (!*attr)
return -EOPNOTSUPP;
return nlmsg_parse(nlh, GENL_HDRLEN, *attr, maxattr, tipc_nl_policy);
}
-int tipc_netlink_start(void)
+int __init tipc_netlink_start(void)
{
int res;
- res = genl_register_family_with_ops(&tipc_genl_family,
- tipc_genl_v2_ops);
+ res = genl_register_family(&tipc_genl_family);
if (res) {
pr_err("Failed to register netlink interface\n");
return res;
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 1fd464764765..e1ae8a8a2b8e 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -1215,15 +1215,6 @@ send:
return err;
}
-static struct genl_family tipc_genl_compat_family = {
- .id = GENL_ID_GENERATE,
- .name = TIPC_GENL_NAME,
- .version = TIPC_GENL_VERSION,
- .hdrsize = TIPC_GENL_HDRLEN,
- .maxattr = 0,
- .netnsok = true,
-};
-
static struct genl_ops tipc_genl_compat_ops[] = {
{
.cmd = TIPC_GENL_CMD,
@@ -1231,12 +1222,22 @@ static struct genl_ops tipc_genl_compat_ops[] = {
},
};
-int tipc_netlink_compat_start(void)
+static struct genl_family tipc_genl_compat_family __ro_after_init = {
+ .name = TIPC_GENL_NAME,
+ .version = TIPC_GENL_VERSION,
+ .hdrsize = TIPC_GENL_HDRLEN,
+ .maxattr = 0,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = tipc_genl_compat_ops,
+ .n_ops = ARRAY_SIZE(tipc_genl_compat_ops),
+};
+
+int __init tipc_netlink_compat_start(void)
{
int res;
- res = genl_register_family_with_ops(&tipc_genl_compat_family,
- tipc_genl_compat_ops);
+ res = genl_register_family(&tipc_genl_compat_family);
if (res) {
pr_err("Failed to register legacy compat interface\n");
return res;
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 9d2f4c2b08ab..4512e83652b1 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -263,6 +263,11 @@ static void tipc_node_write_lock(struct tipc_node *n)
write_lock_bh(&n->lock);
}
+static void tipc_node_write_unlock_fast(struct tipc_node *n)
+{
+ write_unlock_bh(&n->lock);
+}
+
static void tipc_node_write_unlock(struct tipc_node *n)
{
struct net *net = n->net;
@@ -417,7 +422,7 @@ void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr)
}
tipc_node_write_lock(n);
list_add_tail(subscr, &n->publ_list);
- tipc_node_write_unlock(n);
+ tipc_node_write_unlock_fast(n);
tipc_node_put(n);
}
@@ -435,7 +440,7 @@ void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr)
}
tipc_node_write_lock(n);
list_del_init(subscr);
- tipc_node_write_unlock(n);
+ tipc_node_write_unlock_fast(n);
tipc_node_put(n);
}
@@ -1167,7 +1172,7 @@ msg_full:
* @list: chain of buffers containing message
* @dnode: address of destination node
* @selector: a number used for deterministic link selection
- * Consumes the buffer chain, except when returning -ELINKCONG
+ * Consumes the buffer chain.
* Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF
*/
int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
@@ -1206,10 +1211,10 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
spin_unlock_bh(&le->lock);
tipc_node_read_unlock(n);
- if (likely(rc == 0))
- tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
- else if (rc == -ENOBUFS)
+ if (unlikely(rc == -ENOBUFS))
tipc_node_link_down(n, bearer_id, false);
+ else
+ tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
tipc_node_put(n);
@@ -1221,20 +1226,15 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
* messages, which will not be rejected
* The only exception is datagram messages rerouted after secondary
* lookup, which are rare and safe to dispose of anyway.
- * TODO: Return real return value, and let callers use
- * tipc_wait_for_sendpkt() where applicable
*/
int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
u32 selector)
{
struct sk_buff_head head;
- int rc;
skb_queue_head_init(&head);
__skb_queue_tail(&head, skb);
- rc = tipc_node_xmit(net, &head, dnode, selector);
- if (rc == -ELINKCONG)
- kfree_skb(skb);
+ tipc_node_xmit(net, &head, dnode, selector);
return 0;
}
@@ -1262,6 +1262,19 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb)
kfree_skb(skb);
}
+static void tipc_node_mcast_rcv(struct tipc_node *n)
+{
+ struct tipc_bclink_entry *be = &n->bc_entry;
+
+ /* 'arrvq' is under inputq2's lock protection */
+ spin_lock_bh(&be->inputq2.lock);
+ spin_lock_bh(&be->inputq1.lock);
+ skb_queue_splice_tail_init(&be->inputq1, &be->arrvq);
+ spin_unlock_bh(&be->inputq1.lock);
+ spin_unlock_bh(&be->inputq2.lock);
+ tipc_sk_mcast_rcv(n->net, &be->arrvq, &be->inputq2);
+}
+
static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr,
int bearer_id, struct sk_buff_head *xmitq)
{
@@ -1335,15 +1348,8 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id
if (!skb_queue_empty(&xmitq))
tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
- /* Deliver. 'arrvq' is under inputq2's lock protection */
- if (!skb_queue_empty(&be->inputq1)) {
- spin_lock_bh(&be->inputq2.lock);
- spin_lock_bh(&be->inputq1.lock);
- skb_queue_splice_tail_init(&be->inputq1, &be->arrvq);
- spin_unlock_bh(&be->inputq1.lock);
- spin_unlock_bh(&be->inputq2.lock);
- tipc_sk_mcast_rcv(net, &be->arrvq, &be->inputq2);
- }
+ if (!skb_queue_empty(&be->inputq1))
+ tipc_node_mcast_rcv(n);
if (rc & TIPC_LINK_DOWN_EVT) {
/* Reception reassembly failure => reset all links to peer */
@@ -1499,19 +1505,21 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
{
struct sk_buff_head xmitq;
struct tipc_node *n;
- struct tipc_msg *hdr = buf_msg(skb);
- int usr = msg_user(hdr);
+ struct tipc_msg *hdr;
int bearer_id = b->identity;
struct tipc_link_entry *le;
- u16 bc_ack = msg_bcast_ack(hdr);
u32 self = tipc_own_addr(net);
- int rc = 0;
+ int usr, rc = 0;
+ u16 bc_ack;
__skb_queue_head_init(&xmitq);
- /* Ensure message is well-formed */
+ /* Ensure message is well-formed before touching the header */
if (unlikely(!tipc_msg_validate(skb)))
goto discard;
+ hdr = buf_msg(skb);
+ usr = msg_user(hdr);
+ bc_ack = msg_bcast_ack(hdr);
/* Handle arrival of discovery or broadcast packet */
if (unlikely(msg_non_seq(hdr))) {
@@ -1570,6 +1578,9 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
if (unlikely(!skb_queue_empty(&n->bc_entry.namedq)))
tipc_named_rcv(net, &n->bc_entry.namedq);
+ if (unlikely(!skb_queue_empty(&n->bc_entry.inputq1)))
+ tipc_node_mcast_rcv(n);
+
if (!skb_queue_empty(&le->inputq))
tipc_sk_rcv(net, &le->inputq);
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 39ef54c1f2ad..898c22916984 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -47,11 +47,13 @@
enum {
TIPC_BCAST_SYNCH = (1 << 1),
TIPC_BCAST_STATE_NACK = (1 << 2),
- TIPC_BLOCK_FLOWCTL = (1 << 3)
+ TIPC_BLOCK_FLOWCTL = (1 << 3),
+ TIPC_BCAST_RCAST = (1 << 4)
};
#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \
TIPC_BCAST_STATE_NACK | \
+ TIPC_BCAST_RCAST | \
TIPC_BLOCK_FLOWCTL)
#define INVALID_BEARER_ID -1
diff --git a/net/tipc/server.c b/net/tipc/server.c
index 215849ce453d..3cd6402e812c 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -86,12 +86,12 @@ struct outqueue_entry {
static void tipc_recv_work(struct work_struct *work);
static void tipc_send_work(struct work_struct *work);
static void tipc_clean_outqueues(struct tipc_conn *con);
-static void tipc_sock_release(struct tipc_conn *con);
static void tipc_conn_kref_release(struct kref *kref)
{
struct tipc_conn *con = container_of(kref, struct tipc_conn, kref);
- struct sockaddr_tipc *saddr = con->server->saddr;
+ struct tipc_server *s = con->server;
+ struct sockaddr_tipc *saddr = s->saddr;
struct socket *sock = con->sock;
struct sock *sk;
@@ -103,9 +103,13 @@ static void tipc_conn_kref_release(struct kref *kref)
}
saddr->scope = -TIPC_NODE_SCOPE;
kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr));
- tipc_sock_release(con);
sock_release(sock);
con->sock = NULL;
+
+ spin_lock_bh(&s->idr_lock);
+ idr_remove(&s->conn_idr, con->conid);
+ s->idr_in_use--;
+ spin_unlock_bh(&s->idr_lock);
}
tipc_clean_outqueues(con);
@@ -128,8 +132,10 @@ static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid)
spin_lock_bh(&s->idr_lock);
con = idr_find(&s->conn_idr, conid);
- if (con)
+ if (con && test_bit(CF_CONNECTED, &con->flags))
conn_get(con);
+ else
+ con = NULL;
spin_unlock_bh(&s->idr_lock);
return con;
}
@@ -186,26 +192,15 @@ static void tipc_unregister_callbacks(struct tipc_conn *con)
write_unlock_bh(&sk->sk_callback_lock);
}
-static void tipc_sock_release(struct tipc_conn *con)
-{
- struct tipc_server *s = con->server;
-
- if (con->conid)
- s->tipc_conn_release(con->conid, con->usr_data);
-
- tipc_unregister_callbacks(con);
-}
-
static void tipc_close_conn(struct tipc_conn *con)
{
struct tipc_server *s = con->server;
if (test_and_clear_bit(CF_CONNECTED, &con->flags)) {
+ tipc_unregister_callbacks(con);
- spin_lock_bh(&s->idr_lock);
- idr_remove(&s->conn_idr, con->conid);
- s->idr_in_use--;
- spin_unlock_bh(&s->idr_lock);
+ if (con->conid)
+ s->tipc_conn_release(con->conid, con->usr_data);
/* We shouldn't flush pending works as we may be in the
* thread. In fact the races with pending rx/tx work structs
@@ -458,6 +453,11 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid,
if (!con)
return -EINVAL;
+ if (!test_bit(CF_CONNECTED, &con->flags)) {
+ conn_put(con);
+ return 0;
+ }
+
e = tipc_alloc_entry(data, len);
if (!e) {
conn_put(con);
@@ -471,12 +471,8 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid,
list_add_tail(&e->list, &con->outqueue);
spin_unlock_bh(&con->outqueue_lock);
- if (test_bit(CF_CONNECTED, &con->flags)) {
- if (!queue_work(s->send_wq, &con->swork))
- conn_put(con);
- } else {
+ if (!queue_work(s->send_wq, &con->swork))
conn_put(con);
- }
return 0;
}
@@ -500,7 +496,7 @@ static void tipc_send_to_sock(struct tipc_conn *con)
int ret;
spin_lock_bh(&con->outqueue_lock);
- while (1) {
+ while (test_bit(CF_CONNECTED, &con->flags)) {
e = list_entry(con->outqueue.next, struct outqueue_entry,
list);
if ((struct list_head *) e == &con->outqueue)
@@ -623,14 +619,12 @@ int tipc_server_start(struct tipc_server *s)
void tipc_server_stop(struct tipc_server *s)
{
struct tipc_conn *con;
- int total = 0;
int id;
spin_lock_bh(&s->idr_lock);
- for (id = 0; total < s->idr_in_use; id++) {
+ for (id = 0; s->idr_in_use; id++) {
con = idr_find(&s->conn_idr, id);
if (con) {
- total++;
spin_unlock_bh(&s->idr_lock);
tipc_close_conn(con);
spin_lock_bh(&s->idr_lock);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 41f013888f07..7130e73bd42c 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -35,6 +35,8 @@
*/
#include <linux/rhashtable.h>
+#include <linux/sched/signal.h>
+
#include "core.h"
#include "name_table.h"
#include "node.h"
@@ -44,65 +46,67 @@
#include "bcast.h"
#include "netlink.h"
-#define SS_LISTENING -1 /* socket is listening */
-#define SS_READY -2 /* socket is connectionless */
-
#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */
#define CONN_PROBING_INTERVAL msecs_to_jiffies(3600000) /* [ms] => 1 h */
#define TIPC_FWD_MSG 1
-#define TIPC_CONN_OK 0
-#define TIPC_CONN_PROBING 1
#define TIPC_MAX_PORT 0xffffffff
#define TIPC_MIN_PORT 1
+enum {
+ TIPC_LISTEN = TCP_LISTEN,
+ TIPC_ESTABLISHED = TCP_ESTABLISHED,
+ TIPC_OPEN = TCP_CLOSE,
+ TIPC_DISCONNECTING = TCP_CLOSE_WAIT,
+ TIPC_CONNECTING = TCP_SYN_SENT,
+};
+
/**
* struct tipc_sock - TIPC socket structure
* @sk: socket - interacts with 'port' and with user via the socket API
- * @connected: non-zero if port is currently connected to a peer port
* @conn_type: TIPC type used when connection was established
* @conn_instance: TIPC instance used when connection was established
* @published: non-zero if port has one or more associated names
* @max_pkt: maximum packet size "hint" used when building messages sent by port
* @portid: unique port identity in TIPC socket hash table
* @phdr: preformatted message header used when sending messages
- * @port_list: adjacent ports in TIPC's global list of ports
+ * #cong_links: list of congested links
* @publications: list of publications for port
+ * @blocking_link: address of the congested link we are currently sleeping on
* @pub_count: total # of publications port has made during its lifetime
* @probing_state:
- * @probing_intv:
* @conn_timeout: the time we can wait for an unresponded setup request
* @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue
- * @link_cong: non-zero if owner must sleep because of link congestion
+ * @cong_link_cnt: number of congested links
* @sent_unacked: # messages sent by socket, and not yet acked by peer
* @rcv_unacked: # messages read by user, but not yet acked back to peer
- * @remote: 'connected' peer for dgram/rdm
+ * @peer: 'connected' peer for dgram/rdm
* @node: hash table node
+ * @mc_method: cookie for use between socket and broadcast layer
* @rcu: rcu struct for tipc_sock
*/
struct tipc_sock {
struct sock sk;
- int connected;
u32 conn_type;
u32 conn_instance;
int published;
u32 max_pkt;
u32 portid;
struct tipc_msg phdr;
- struct list_head sock_list;
+ struct list_head cong_links;
struct list_head publications;
u32 pub_count;
- u32 probing_state;
- unsigned long probing_intv;
uint conn_timeout;
atomic_t dupl_rcvcnt;
- bool link_cong;
+ bool probe_unacked;
+ u16 cong_link_cnt;
u16 snt_unacked;
u16 snd_win;
u16 peer_caps;
u16 rcv_unacked;
u16 rcv_win;
- struct sockaddr_tipc remote;
+ struct sockaddr_tipc peer;
struct rhash_head node;
+ struct tipc_mc_method mc_method;
struct rcu_head rcu;
};
@@ -111,8 +115,8 @@ static void tipc_data_ready(struct sock *sk);
static void tipc_write_space(struct sock *sk);
static void tipc_sock_destruct(struct sock *sk);
static int tipc_release(struct socket *sock);
-static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags);
-static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p);
+static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags,
+ bool kern);
static void tipc_sk_timeout(unsigned long data);
static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
struct tipc_name_seq const *seq);
@@ -121,8 +125,7 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid);
static int tipc_sk_insert(struct tipc_sock *tsk);
static void tipc_sk_remove(struct tipc_sock *tsk);
-static int __tipc_send_stream(struct socket *sock, struct msghdr *m,
- size_t dsz);
+static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz);
static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz);
static const struct proto_ops packet_ops;
@@ -248,6 +251,21 @@ static void tsk_rej_rx_queue(struct sock *sk)
tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT);
}
+static bool tipc_sk_connected(struct sock *sk)
+{
+ return sk->sk_state == TIPC_ESTABLISHED;
+}
+
+/* tipc_sk_type_connectionless - check if the socket is datagram socket
+ * @sk: socket
+ *
+ * Returns true if connection less, false otherwise
+ */
+static bool tipc_sk_type_connectionless(struct sock *sk)
+{
+ return sk->sk_type == SOCK_RDM || sk->sk_type == SOCK_DGRAM;
+}
+
/* tsk_peer_msg - verify if message was sent by connected port's peer
*
* Handles cases where the node's network address has changed from
@@ -255,12 +273,13 @@ static void tsk_rej_rx_queue(struct sock *sk)
*/
static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg)
{
- struct tipc_net *tn = net_generic(sock_net(&tsk->sk), tipc_net_id);
+ struct sock *sk = &tsk->sk;
+ struct tipc_net *tn = net_generic(sock_net(sk), tipc_net_id);
u32 peer_port = tsk_peer_port(tsk);
u32 orig_node;
u32 peer_node;
- if (unlikely(!tsk->connected))
+ if (unlikely(!tipc_sk_connected(sk)))
return false;
if (unlikely(msg_origport(msg) != peer_port))
@@ -281,6 +300,88 @@ static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg)
return false;
}
+/* tipc_set_sk_state - set the sk_state of the socket
+ * @sk: socket
+ *
+ * Caller must hold socket lock
+ *
+ * Returns 0 on success, errno otherwise
+ */
+static int tipc_set_sk_state(struct sock *sk, int state)
+{
+ int oldsk_state = sk->sk_state;
+ int res = -EINVAL;
+
+ switch (state) {
+ case TIPC_OPEN:
+ res = 0;
+ break;
+ case TIPC_LISTEN:
+ case TIPC_CONNECTING:
+ if (oldsk_state == TIPC_OPEN)
+ res = 0;
+ break;
+ case TIPC_ESTABLISHED:
+ if (oldsk_state == TIPC_CONNECTING ||
+ oldsk_state == TIPC_OPEN)
+ res = 0;
+ break;
+ case TIPC_DISCONNECTING:
+ if (oldsk_state == TIPC_CONNECTING ||
+ oldsk_state == TIPC_ESTABLISHED)
+ res = 0;
+ break;
+ }
+
+ if (!res)
+ sk->sk_state = state;
+
+ return res;
+}
+
+static int tipc_sk_sock_err(struct socket *sock, long *timeout)
+{
+ struct sock *sk = sock->sk;
+ int err = sock_error(sk);
+ int typ = sock->type;
+
+ if (err)
+ return err;
+ if (typ == SOCK_STREAM || typ == SOCK_SEQPACKET) {
+ if (sk->sk_state == TIPC_DISCONNECTING)
+ return -EPIPE;
+ else if (!tipc_sk_connected(sk))
+ return -ENOTCONN;
+ }
+ if (!*timeout)
+ return -EAGAIN;
+ if (signal_pending(current))
+ return sock_intr_errno(*timeout);
+
+ return 0;
+}
+
+#define tipc_wait_for_cond(sock_, timeout_, condition_) \
+({ \
+ int rc_ = 0; \
+ int done_ = 0; \
+ \
+ while (!(condition_) && !done_) { \
+ struct sock *sk_ = sock->sk; \
+ DEFINE_WAIT_FUNC(wait_, woken_wake_function); \
+ \
+ rc_ = tipc_sk_sock_err(sock_, timeout_); \
+ if (rc_) \
+ break; \
+ prepare_to_wait(sk_sleep(sk_), &wait_, \
+ TASK_INTERRUPTIBLE); \
+ done_ = sk_wait_event(sk_, timeout_, \
+ (condition_), &wait_); \
+ remove_wait_queue(sk_sleep(sk_), &wait_); \
+ } \
+ rc_; \
+})
+
/**
* tipc_sk_create - create a TIPC socket
* @net: network namespace (must be default network)
@@ -298,7 +399,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
{
struct tipc_net *tn;
const struct proto_ops *ops;
- socket_state state;
struct sock *sk;
struct tipc_sock *tsk;
struct tipc_msg *msg;
@@ -310,16 +410,13 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
switch (sock->type) {
case SOCK_STREAM:
ops = &stream_ops;
- state = SS_UNCONNECTED;
break;
case SOCK_SEQPACKET:
ops = &packet_ops;
- state = SS_UNCONNECTED;
break;
case SOCK_DGRAM:
case SOCK_RDM:
ops = &msg_ops;
- state = SS_READY;
break;
default:
return -EPROTOTYPE;
@@ -333,21 +430,28 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
tsk = tipc_sk(sk);
tsk->max_pkt = MAX_PKT_DEFAULT;
INIT_LIST_HEAD(&tsk->publications);
+ INIT_LIST_HEAD(&tsk->cong_links);
msg = &tsk->phdr;
tn = net_generic(sock_net(sk), tipc_net_id);
- tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG,
- NAMED_H_SIZE, 0);
/* Finish initializing socket data structures */
sock->ops = ops;
- sock->state = state;
sock_init_data(sock, sk);
+ tipc_set_sk_state(sk, TIPC_OPEN);
if (tipc_sk_insert(tsk)) {
pr_warn("Socket create failed; port number exhausted\n");
return -EINVAL;
}
+
+ /* Ensure tsk is visible before we read own_addr. */
+ smp_mb();
+
+ tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG,
+ NAMED_H_SIZE, 0);
+
msg_set_origport(msg, tsk->portid);
setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk);
+ sk->sk_shutdown = 0;
sk->sk_backlog_rcv = tipc_backlog_rcv;
sk->sk_rcvbuf = sysctl_tipc_rmem[1];
sk->sk_data_ready = tipc_data_ready;
@@ -360,11 +464,12 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
tsk->snd_win = tsk_adv_blocks(RCVBUF_MIN);
tsk->rcv_win = tsk->snd_win;
- if (sock->state == SS_READY) {
+ if (tipc_sk_type_connectionless(sk)) {
tsk_set_unreturnable(tsk, true);
if (sock->type == SOCK_DGRAM)
tsk_set_unreliable(tsk, true);
}
+
return 0;
}
@@ -375,6 +480,51 @@ static void tipc_sk_callback(struct rcu_head *head)
sock_put(&tsk->sk);
}
+/* Caller should hold socket lock for the socket. */
+static void __tipc_shutdown(struct socket *sock, int error)
+{
+ struct sock *sk = sock->sk;
+ struct tipc_sock *tsk = tipc_sk(sk);
+ struct net *net = sock_net(sk);
+ long timeout = CONN_TIMEOUT_DEFAULT;
+ u32 dnode = tsk_peer_node(tsk);
+ struct sk_buff *skb;
+
+ /* Avoid that hi-prio shutdown msgs bypass msgs in link wakeup queue */
+ tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt &&
+ !tsk_conn_cong(tsk)));
+
+ /* Reject all unreceived messages, except on an active connection
+ * (which disconnects locally & sends a 'FIN+' to peer).
+ */
+ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ if (TIPC_SKB_CB(skb)->bytes_read) {
+ kfree_skb(skb);
+ continue;
+ }
+ if (!tipc_sk_type_connectionless(sk) &&
+ sk->sk_state != TIPC_DISCONNECTING) {
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
+ tipc_node_remove_conn(net, dnode, tsk->portid);
+ }
+ tipc_sk_respond(sk, skb, error);
+ }
+
+ if (tipc_sk_type_connectionless(sk))
+ return;
+
+ if (sk->sk_state != TIPC_DISCONNECTING) {
+ skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
+ TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode,
+ tsk_own_node(tsk), tsk_peer_port(tsk),
+ tsk->portid, error);
+ if (skb)
+ tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
+ tipc_node_remove_conn(net, dnode, tsk->portid);
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
+ }
+}
+
/**
* tipc_release - destroy a TIPC socket
* @sock: socket to destroy
@@ -394,10 +544,7 @@ static void tipc_sk_callback(struct rcu_head *head)
static int tipc_release(struct socket *sock)
{
struct sock *sk = sock->sk;
- struct net *net;
struct tipc_sock *tsk;
- struct sk_buff *skb;
- u32 dnode;
/*
* Exit if socket isn't fully initialized (occurs when a failed accept()
@@ -406,49 +553,19 @@ static int tipc_release(struct socket *sock)
if (sk == NULL)
return 0;
- net = sock_net(sk);
tsk = tipc_sk(sk);
lock_sock(sk);
- /*
- * Reject all unreceived messages, except on an active connection
- * (which disconnects locally & sends a 'FIN+' to peer)
- */
- dnode = tsk_peer_node(tsk);
- while (sock->state != SS_DISCONNECTING) {
- skb = __skb_dequeue(&sk->sk_receive_queue);
- if (skb == NULL)
- break;
- if (TIPC_SKB_CB(skb)->handle != NULL)
- kfree_skb(skb);
- else {
- if ((sock->state == SS_CONNECTING) ||
- (sock->state == SS_CONNECTED)) {
- sock->state = SS_DISCONNECTING;
- tsk->connected = 0;
- tipc_node_remove_conn(net, dnode, tsk->portid);
- }
- tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT);
- }
- }
-
+ __tipc_shutdown(sock, TIPC_ERR_NO_PORT);
+ sk->sk_shutdown = SHUTDOWN_MASK;
tipc_sk_withdraw(tsk, 0, NULL);
sk_stop_timer(sk, &sk->sk_timer);
tipc_sk_remove(tsk);
- if (tsk->connected) {
- skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
- TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode,
- tsk_own_node(tsk), tsk_peer_port(tsk),
- tsk->portid, TIPC_ERR_NO_PORT);
- if (skb)
- tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
- tipc_node_remove_conn(net, dnode, tsk->portid);
- }
/* Reject any messages that accumulated in backlog queue */
- sock->state = SS_DISCONNECTING;
release_sock(sk);
-
+ u32_list_purge(&tsk->cong_links);
+ tsk->cong_link_cnt = 0;
call_rcu(&tsk->rcu, tipc_sk_callback);
sock->sk = NULL;
@@ -532,13 +649,14 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
int *uaddr_len, int peer)
{
struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr;
- struct tipc_sock *tsk = tipc_sk(sock->sk);
+ struct sock *sk = sock->sk;
+ struct tipc_sock *tsk = tipc_sk(sk);
struct tipc_net *tn = net_generic(sock_net(sock->sk), tipc_net_id);
memset(addr, 0, sizeof(*addr));
if (peer) {
- if ((sock->state != SS_CONNECTED) &&
- ((peer != 2) || (sock->state != SS_DISCONNECTING)))
+ if ((!tipc_sk_connected(sk)) &&
+ ((peer != 2) || (sk->sk_state != TIPC_DISCONNECTING)))
return -ENOTCONN;
addr->addr.id.ref = tsk_peer_port(tsk);
addr->addr.id.node = tsk_peer_node(tsk);
@@ -570,28 +688,6 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
* exits. TCP and other protocols seem to rely on higher level poll routines
* to handle any preventable race conditions, so TIPC will do the same ...
*
- * TIPC sets the returned events as follows:
- *
- * socket state flags set
- * ------------ ---------
- * unconnected no read flags
- * POLLOUT if port is not congested
- *
- * connecting POLLIN/POLLRDNORM if ACK/NACK in rx queue
- * no write flags
- *
- * connected POLLIN/POLLRDNORM if data in rx queue
- * POLLOUT if port is not congested
- *
- * disconnecting POLLIN/POLLRDNORM/POLLHUP
- * no write flags
- *
- * listening POLLIN if SYN in rx queue
- * no write flags
- *
- * ready POLLIN/POLLRDNORM if data in rx queue
- * [connectionless] POLLOUT (since port cannot be congested)
- *
* IMPORTANT: The fact that a read or write operation is indicated does NOT
* imply that the operation will succeed, merely that it should be performed
* and will not block.
@@ -605,22 +701,29 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
sock_poll_wait(file, sk_sleep(sk), wait);
- switch ((int)sock->state) {
- case SS_UNCONNECTED:
- if (!tsk->link_cong)
- mask |= POLLOUT;
- break;
- case SS_READY:
- case SS_CONNECTED:
- if (!tsk->link_cong && !tsk_conn_cong(tsk))
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ mask |= POLLHUP;
+
+ switch (sk->sk_state) {
+ case TIPC_ESTABLISHED:
+ if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))
mask |= POLLOUT;
/* fall thru' */
- case SS_CONNECTING:
- case SS_LISTENING:
+ case TIPC_LISTEN:
+ case TIPC_CONNECTING:
if (!skb_queue_empty(&sk->sk_receive_queue))
mask |= (POLLIN | POLLRDNORM);
break;
- case SS_DISCONNECTING:
+ case TIPC_OPEN:
+ if (!tsk->cong_link_cnt)
+ mask |= POLLOUT;
+ if (tipc_sk_type_connectionless(sk) &&
+ (!skb_queue_empty(&sk->sk_receive_queue)))
+ mask |= (POLLIN | POLLRDNORM);
+ break;
+ case TIPC_DISCONNECTING:
mask = (POLLIN | POLLRDNORM | POLLHUP);
break;
}
@@ -633,60 +736,60 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
* @sock: socket structure
* @seq: destination address
* @msg: message to send
- * @dsz: total length of message data
- * @timeo: timeout to wait for wakeup
+ * @dlen: length of data to send
+ * @timeout: timeout to wait for wakeup
*
* Called from function tipc_sendmsg(), which has done all sanity checks
* Returns the number of bytes sent on success, or errno
*/
static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq,
- struct msghdr *msg, size_t dsz, long timeo)
+ struct msghdr *msg, size_t dlen, long timeout)
{
struct sock *sk = sock->sk;
struct tipc_sock *tsk = tipc_sk(sk);
+ struct tipc_msg *hdr = &tsk->phdr;
struct net *net = sock_net(sk);
- struct tipc_msg *mhdr = &tsk->phdr;
- struct sk_buff_head pktchain;
- struct iov_iter save = msg->msg_iter;
- uint mtu;
+ int mtu = tipc_bcast_get_mtu(net);
+ struct tipc_mc_method *method = &tsk->mc_method;
+ u32 domain = addr_domain(net, TIPC_CLUSTER_SCOPE);
+ struct sk_buff_head pkts;
+ struct tipc_nlist dsts;
int rc;
- msg_set_type(mhdr, TIPC_MCAST_MSG);
- msg_set_lookup_scope(mhdr, TIPC_CLUSTER_SCOPE);
- msg_set_destport(mhdr, 0);
- msg_set_destnode(mhdr, 0);
- msg_set_nametype(mhdr, seq->type);
- msg_set_namelower(mhdr, seq->lower);
- msg_set_nameupper(mhdr, seq->upper);
- msg_set_hdr_sz(mhdr, MCAST_H_SIZE);
-
- skb_queue_head_init(&pktchain);
-
-new_mtu:
- mtu = tipc_bcast_get_mtu(net);
- rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, &pktchain);
- if (unlikely(rc < 0))
+ /* Block or return if any destination link is congested */
+ rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt);
+ if (unlikely(rc))
return rc;
- do {
- rc = tipc_bcast_xmit(net, &pktchain);
- if (likely(!rc))
- return dsz;
-
- if (rc == -ELINKCONG) {
- tsk->link_cong = 1;
- rc = tipc_wait_for_sndmsg(sock, &timeo);
- if (!rc)
- continue;
- }
- __skb_queue_purge(&pktchain);
- if (rc == -EMSGSIZE) {
- msg->msg_iter = save;
- goto new_mtu;
- }
- break;
- } while (1);
- return rc;
+ /* Lookup destination nodes */
+ tipc_nlist_init(&dsts, tipc_own_addr(net));
+ tipc_nametbl_lookup_dst_nodes(net, seq->type, seq->lower,
+ seq->upper, domain, &dsts);
+ if (!dsts.local && !dsts.remote)
+ return -EHOSTUNREACH;
+
+ /* Build message header */
+ msg_set_type(hdr, TIPC_MCAST_MSG);
+ msg_set_hdr_sz(hdr, MCAST_H_SIZE);
+ msg_set_lookup_scope(hdr, TIPC_CLUSTER_SCOPE);
+ msg_set_destport(hdr, 0);
+ msg_set_destnode(hdr, 0);
+ msg_set_nametype(hdr, seq->type);
+ msg_set_namelower(hdr, seq->lower);
+ msg_set_nameupper(hdr, seq->upper);
+
+ /* Build message as chain of buffers */
+ skb_queue_head_init(&pkts);
+ rc = tipc_msg_build(hdr, msg, 0, dlen, mtu, &pkts);
+
+ /* Send message if build was successful */
+ if (unlikely(rc == dlen))
+ rc = tipc_mcast_xmit(net, &pkts, method, &dsts,
+ &tsk->cong_link_cnt);
+
+ tipc_nlist_purge(&dsts);
+
+ return rc ? rc : dlen;
}
/**
@@ -700,7 +803,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
struct sk_buff_head *inputq)
{
struct tipc_msg *msg;
- struct tipc_plist dports;
+ struct list_head dports;
u32 portid;
u32 scope = TIPC_CLUSTER_SCOPE;
struct sk_buff_head tmpq;
@@ -708,7 +811,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
struct sk_buff *skb, *_skb;
__skb_queue_head_init(&tmpq);
- tipc_plist_init(&dports);
+ INIT_LIST_HEAD(&dports);
skb = tipc_skb_peek(arrvq, &inputq->lock);
for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) {
@@ -722,8 +825,8 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
tipc_nametbl_mc_translate(net,
msg_nametype(msg), msg_namelower(msg),
msg_nameupper(msg), scope, &dports);
- portid = tipc_plist_pop(&dports);
- for (; portid; portid = tipc_plist_pop(&dports)) {
+ portid = u32_pop(&dports);
+ for (; portid; portid = u32_pop(&dports)) {
_skb = __pskb_copy(skb, hsz, GFP_ATOMIC);
if (_skb) {
msg_set_destport(buf_msg(_skb), portid);
@@ -763,7 +866,7 @@ static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
if (!tsk_peer_msg(tsk, hdr))
goto exit;
- tsk->probing_state = TIPC_CONN_OK;
+ tsk->probe_unacked = false;
if (mtyp == CONN_PROBE) {
msg_set_type(hdr, CONN_PROBE_REPLY);
@@ -784,31 +887,6 @@ exit:
kfree_skb(skb);
}
-static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p)
-{
- struct sock *sk = sock->sk;
- struct tipc_sock *tsk = tipc_sk(sk);
- DEFINE_WAIT(wait);
- int done;
-
- do {
- int err = sock_error(sk);
- if (err)
- return err;
- if (sock->state == SS_DISCONNECTING)
- return -EPIPE;
- if (!*timeo_p)
- return -EAGAIN;
- if (signal_pending(current))
- return sock_intr_errno(*timeo_p);
-
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- done = sk_wait_event(sk, timeo_p, !tsk->link_cong);
- finish_wait(sk_sleep(sk), &wait);
- } while (!done);
- return 0;
-}
-
/**
* tipc_sendmsg - send message in connectionless manner
* @sock: socket structure
@@ -835,37 +913,41 @@ static int tipc_sendmsg(struct socket *sock,
return ret;
}
-static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
+static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
{
- DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
struct sock *sk = sock->sk;
- struct tipc_sock *tsk = tipc_sk(sk);
struct net *net = sock_net(sk);
- struct tipc_msg *mhdr = &tsk->phdr;
- u32 dnode, dport;
- struct sk_buff_head pktchain;
- struct sk_buff *skb;
+ struct tipc_sock *tsk = tipc_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
+ long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
+ struct list_head *clinks = &tsk->cong_links;
+ bool syn = !tipc_sk_type_connectionless(sk);
+ struct tipc_msg *hdr = &tsk->phdr;
struct tipc_name_seq *seq;
- struct iov_iter save;
- u32 mtu;
- long timeo;
- int rc;
+ struct sk_buff_head pkts;
+ u32 type, inst, domain;
+ u32 dnode, dport;
+ int mtu, rc;
- if (dsz > TIPC_MAX_USER_MSG_SIZE)
+ if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE))
return -EMSGSIZE;
+
if (unlikely(!dest)) {
- if (tsk->connected && sock->state == SS_READY)
- dest = &tsk->remote;
- else
+ dest = &tsk->peer;
+ if (!syn || dest->family != AF_TIPC)
return -EDESTADDRREQ;
- } else if (unlikely(m->msg_namelen < sizeof(*dest)) ||
- dest->family != AF_TIPC) {
- return -EINVAL;
}
- if (unlikely(sock->state != SS_READY)) {
- if (sock->state == SS_LISTENING)
+
+ if (unlikely(m->msg_namelen < sizeof(*dest)))
+ return -EINVAL;
+
+ if (unlikely(dest->family != AF_TIPC))
+ return -EINVAL;
+
+ if (unlikely(syn)) {
+ if (sk->sk_state == TIPC_LISTEN)
return -EPIPE;
- if (sock->state != SS_UNCONNECTED)
+ if (sk->sk_state != TIPC_OPEN)
return -EISCONN;
if (tsk->published)
return -EOPNOTSUPP;
@@ -874,102 +956,62 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
tsk->conn_instance = dest->addr.name.name.instance;
}
}
- seq = &dest->addr.nameseq;
- timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
- if (dest->addrtype == TIPC_ADDR_MCAST) {
- return tipc_sendmcast(sock, seq, m, dsz, timeo);
- } else if (dest->addrtype == TIPC_ADDR_NAME) {
- u32 type = dest->addr.name.name.type;
- u32 inst = dest->addr.name.name.instance;
- u32 domain = dest->addr.name.domain;
+ seq = &dest->addr.nameseq;
+ if (dest->addrtype == TIPC_ADDR_MCAST)
+ return tipc_sendmcast(sock, seq, m, dlen, timeout);
+ if (dest->addrtype == TIPC_ADDR_NAME) {
+ type = dest->addr.name.name.type;
+ inst = dest->addr.name.name.instance;
+ domain = dest->addr.name.domain;
dnode = domain;
- msg_set_type(mhdr, TIPC_NAMED_MSG);
- msg_set_hdr_sz(mhdr, NAMED_H_SIZE);
- msg_set_nametype(mhdr, type);
- msg_set_nameinst(mhdr, inst);
- msg_set_lookup_scope(mhdr, tipc_addr_scope(domain));
+ msg_set_type(hdr, TIPC_NAMED_MSG);
+ msg_set_hdr_sz(hdr, NAMED_H_SIZE);
+ msg_set_nametype(hdr, type);
+ msg_set_nameinst(hdr, inst);
+ msg_set_lookup_scope(hdr, tipc_addr_scope(domain));
dport = tipc_nametbl_translate(net, type, inst, &dnode);
- msg_set_destnode(mhdr, dnode);
- msg_set_destport(mhdr, dport);
+ msg_set_destnode(hdr, dnode);
+ msg_set_destport(hdr, dport);
if (unlikely(!dport && !dnode))
return -EHOSTUNREACH;
+
} else if (dest->addrtype == TIPC_ADDR_ID) {
dnode = dest->addr.id.node;
- msg_set_type(mhdr, TIPC_DIRECT_MSG);
- msg_set_lookup_scope(mhdr, 0);
- msg_set_destnode(mhdr, dnode);
- msg_set_destport(mhdr, dest->addr.id.ref);
- msg_set_hdr_sz(mhdr, BASIC_H_SIZE);
+ msg_set_type(hdr, TIPC_DIRECT_MSG);
+ msg_set_lookup_scope(hdr, 0);
+ msg_set_destnode(hdr, dnode);
+ msg_set_destport(hdr, dest->addr.id.ref);
+ msg_set_hdr_sz(hdr, BASIC_H_SIZE);
}
- skb_queue_head_init(&pktchain);
- save = m->msg_iter;
-new_mtu:
- mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
- rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, &pktchain);
- if (rc < 0)
+ /* Block or return if destination link is congested */
+ rc = tipc_wait_for_cond(sock, &timeout, !u32_find(clinks, dnode));
+ if (unlikely(rc))
return rc;
- do {
- skb = skb_peek(&pktchain);
- TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong;
- rc = tipc_node_xmit(net, &pktchain, dnode, tsk->portid);
- if (likely(!rc)) {
- if (sock->state != SS_READY)
- sock->state = SS_CONNECTING;
- return dsz;
- }
- if (rc == -ELINKCONG) {
- tsk->link_cong = 1;
- rc = tipc_wait_for_sndmsg(sock, &timeo);
- if (!rc)
- continue;
- }
- __skb_queue_purge(&pktchain);
- if (rc == -EMSGSIZE) {
- m->msg_iter = save;
- goto new_mtu;
- }
- break;
- } while (1);
-
- return rc;
-}
+ skb_queue_head_init(&pkts);
+ mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
+ rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
+ if (unlikely(rc != dlen))
+ return rc;
-static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p)
-{
- struct sock *sk = sock->sk;
- struct tipc_sock *tsk = tipc_sk(sk);
- DEFINE_WAIT(wait);
- int done;
+ rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
+ if (unlikely(rc == -ELINKCONG)) {
+ u32_push(clinks, dnode);
+ tsk->cong_link_cnt++;
+ rc = 0;
+ }
- do {
- int err = sock_error(sk);
- if (err)
- return err;
- if (sock->state == SS_DISCONNECTING)
- return -EPIPE;
- else if (sock->state != SS_CONNECTED)
- return -ENOTCONN;
- if (!*timeo_p)
- return -EAGAIN;
- if (signal_pending(current))
- return sock_intr_errno(*timeo_p);
+ if (unlikely(syn && !rc))
+ tipc_set_sk_state(sk, TIPC_CONNECTING);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- done = sk_wait_event(sk, timeo_p,
- (!tsk->link_cong &&
- !tsk_conn_cong(tsk)) ||
- !tsk->connected);
- finish_wait(sk_sleep(sk), &wait);
- } while (!done);
- return 0;
+ return rc ? rc : dlen;
}
/**
- * tipc_send_stream - send stream-oriented data
+ * tipc_sendstream - send stream-oriented data
* @sock: socket structure
* @m: data to send
* @dsz: total length of data to be transmitted
@@ -979,91 +1021,69 @@ static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p)
* Returns the number of bytes sent on success (or partial success),
* or errno if no data sent
*/
-static int tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
+static int tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz)
{
struct sock *sk = sock->sk;
int ret;
lock_sock(sk);
- ret = __tipc_send_stream(sock, m, dsz);
+ ret = __tipc_sendstream(sock, m, dsz);
release_sock(sk);
return ret;
}
-static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
+static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
{
struct sock *sk = sock->sk;
- struct net *net = sock_net(sk);
- struct tipc_sock *tsk = tipc_sk(sk);
- struct tipc_msg *mhdr = &tsk->phdr;
- struct sk_buff_head pktchain;
DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
- u32 portid = tsk->portid;
- int rc = -EINVAL;
- long timeo;
- u32 dnode;
- uint mtu, send, sent = 0;
- struct iov_iter save;
- int hlen = MIN_H_SIZE;
-
- /* Handle implied connection establishment */
- if (unlikely(dest)) {
- rc = __tipc_sendmsg(sock, m, dsz);
- hlen = msg_hdr_sz(mhdr);
- if (dsz && (dsz == rc))
- tsk->snt_unacked = tsk_inc(tsk, dsz + hlen);
- return rc;
- }
- if (dsz > (uint)INT_MAX)
- return -EMSGSIZE;
+ long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
+ struct tipc_sock *tsk = tipc_sk(sk);
+ struct tipc_msg *hdr = &tsk->phdr;
+ struct net *net = sock_net(sk);
+ struct sk_buff_head pkts;
+ u32 dnode = tsk_peer_node(tsk);
+ int send, sent = 0;
+ int rc = 0;
- if (unlikely(sock->state != SS_CONNECTED)) {
- if (sock->state == SS_DISCONNECTING)
- return -EPIPE;
- else
- return -ENOTCONN;
- }
+ skb_queue_head_init(&pkts);
- timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
- dnode = tsk_peer_node(tsk);
- skb_queue_head_init(&pktchain);
+ if (unlikely(dlen > INT_MAX))
+ return -EMSGSIZE;
-next:
- save = m->msg_iter;
- mtu = tsk->max_pkt;
- send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE);
- rc = tipc_msg_build(mhdr, m, sent, send, mtu, &pktchain);
- if (unlikely(rc < 0))
+ /* Handle implicit connection setup */
+ if (unlikely(dest)) {
+ rc = __tipc_sendmsg(sock, m, dlen);
+ if (dlen && (dlen == rc))
+ tsk->snt_unacked = tsk_inc(tsk, dlen + msg_hdr_sz(hdr));
return rc;
+ }
do {
- if (likely(!tsk_conn_cong(tsk))) {
- rc = tipc_node_xmit(net, &pktchain, dnode, portid);
- if (likely(!rc)) {
- tsk->snt_unacked += tsk_inc(tsk, send + hlen);
- sent += send;
- if (sent == dsz)
- return dsz;
- goto next;
- }
- if (rc == -EMSGSIZE) {
- __skb_queue_purge(&pktchain);
- tsk->max_pkt = tipc_node_get_mtu(net, dnode,
- portid);
- m->msg_iter = save;
- goto next;
- }
- if (rc != -ELINKCONG)
- break;
+ rc = tipc_wait_for_cond(sock, &timeout,
+ (!tsk->cong_link_cnt &&
+ !tsk_conn_cong(tsk) &&
+ tipc_sk_connected(sk)));
+ if (unlikely(rc))
+ break;
- tsk->link_cong = 1;
+ send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE);
+ rc = tipc_msg_build(hdr, m, sent, send, tsk->max_pkt, &pkts);
+ if (unlikely(rc != send))
+ break;
+
+ rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
+ if (unlikely(rc == -ELINKCONG)) {
+ tsk->cong_link_cnt = 1;
+ rc = 0;
+ }
+ if (likely(!rc)) {
+ tsk->snt_unacked += tsk_inc(tsk, send + MIN_H_SIZE);
+ sent += send;
}
- rc = tipc_wait_for_sndpkt(sock, &timeo);
- } while (!rc);
+ } while (sent < dlen && !rc);
- __skb_queue_purge(&pktchain);
- return sent ? sent : rc;
+ return rc ? rc : sent;
}
/**
@@ -1081,7 +1101,7 @@ static int tipc_send_packet(struct socket *sock, struct msghdr *m, size_t dsz)
if (dsz > TIPC_MAX_USER_MSG_SIZE)
return -EMSGSIZE;
- return tipc_send_stream(sock, m, dsz);
+ return tipc_sendstream(sock, m, dsz);
}
/* tipc_sk_finish_conn - complete the setup of a connection
@@ -1099,10 +1119,8 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port,
msg_set_lookup_scope(msg, 0);
msg_set_hdr_sz(msg, SHORT_H_SIZE);
- tsk->probing_intv = CONN_PROBING_INTERVAL;
- tsk->probing_state = TIPC_CONN_OK;
- tsk->connected = 1;
- sk_reset_timer(sk, &sk->sk_timer, jiffies + tsk->probing_intv);
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTERVAL);
+ tipc_set_sk_state(sk, TIPC_ESTABLISHED);
tipc_node_add_conn(net, peer_node, tsk->portid, peer_port);
tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid);
tsk->peer_caps = tipc_node_get_capabilities(net, peer_node);
@@ -1210,13 +1228,14 @@ static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg,
static void tipc_sk_send_ack(struct tipc_sock *tsk)
{
- struct net *net = sock_net(&tsk->sk);
+ struct sock *sk = &tsk->sk;
+ struct net *net = sock_net(sk);
struct sk_buff *skb = NULL;
struct tipc_msg *msg;
u32 peer_port = tsk_peer_port(tsk);
u32 dnode = tsk_peer_node(tsk);
- if (!tsk->connected)
+ if (!tipc_sk_connected(sk))
return;
skb = tipc_msg_create(CONN_MANAGER, CONN_ACK, INT_H_SIZE, 0,
dnode, tsk_own_node(tsk), peer_port,
@@ -1245,7 +1264,7 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop)
for (;;) {
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
if (timeo && skb_queue_empty(&sk->sk_receive_queue)) {
- if (sock->state == SS_DISCONNECTING) {
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
err = -ENOTCONN;
break;
}
@@ -1286,6 +1305,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buf_len,
struct tipc_sock *tsk = tipc_sk(sk);
struct sk_buff *buf;
struct tipc_msg *msg;
+ bool is_connectionless = tipc_sk_type_connectionless(sk);
long timeo;
unsigned int sz;
u32 err;
@@ -1297,7 +1317,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buf_len,
lock_sock(sk);
- if (unlikely(sock->state == SS_UNCONNECTED)) {
+ if (!is_connectionless && unlikely(sk->sk_state == TIPC_OPEN)) {
res = -ENOTCONN;
goto exit;
}
@@ -1342,8 +1362,8 @@ restart:
goto exit;
res = sz;
} else {
- if ((sock->state == SS_READY) ||
- ((err == TIPC_CONN_SHUTDOWN) || m->msg_control))
+ if (is_connectionless || err == TIPC_CONN_SHUTDOWN ||
+ m->msg_control)
res = 0;
else
res = -ECONNRESET;
@@ -1352,7 +1372,7 @@ restart:
if (unlikely(flags & MSG_PEEK))
goto exit;
- if (likely(sock->state != SS_READY)) {
+ if (likely(!is_connectionless)) {
tsk->rcv_unacked += tsk_inc(tsk, hlen + sz);
if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4)))
tipc_sk_send_ack(tsk);
@@ -1383,7 +1403,7 @@ static int tipc_recv_stream(struct socket *sock, struct msghdr *m,
struct tipc_msg *msg;
long timeo;
unsigned int sz;
- int sz_to_copy, target, needed;
+ int target;
int sz_copied = 0;
u32 err;
int res = 0, hlen;
@@ -1394,7 +1414,7 @@ static int tipc_recv_stream(struct socket *sock, struct msghdr *m,
lock_sock(sk);
- if (unlikely(sock->state == SS_UNCONNECTED)) {
+ if (unlikely(sk->sk_state == TIPC_OPEN)) {
res = -ENOTCONN;
goto exit;
}
@@ -1431,11 +1451,13 @@ restart:
/* Capture message data (if valid) & compute return value (always) */
if (!err) {
- u32 offset = (u32)(unsigned long)(TIPC_SKB_CB(buf)->handle);
+ u32 offset = TIPC_SKB_CB(buf)->bytes_read;
+ u32 needed;
+ int sz_to_copy;
sz -= offset;
needed = (buf_len - sz_copied);
- sz_to_copy = (sz <= needed) ? sz : needed;
+ sz_to_copy = min(sz, needed);
res = skb_copy_datagram_msg(buf, hlen + offset, m, sz_to_copy);
if (res)
@@ -1445,8 +1467,8 @@ restart:
if (sz_to_copy < sz) {
if (!(flags & MSG_PEEK))
- TIPC_SKB_CB(buf)->handle =
- (void *)(unsigned long)(offset + sz_to_copy);
+ TIPC_SKB_CB(buf)->bytes_read =
+ offset + sz_to_copy;
goto exit;
}
} else {
@@ -1528,49 +1550,31 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
{
struct sock *sk = &tsk->sk;
struct net *net = sock_net(sk);
- struct socket *sock = sk->sk_socket;
struct tipc_msg *hdr = buf_msg(skb);
if (unlikely(msg_mcast(hdr)))
return false;
- switch ((int)sock->state) {
- case SS_CONNECTED:
-
- /* Accept only connection-based messages sent by peer */
- if (unlikely(!tsk_peer_msg(tsk, hdr)))
- return false;
-
- if (unlikely(msg_errcode(hdr))) {
- sock->state = SS_DISCONNECTING;
- tsk->connected = 0;
- /* Let timer expire on it's own */
- tipc_node_remove_conn(net, tsk_peer_node(tsk),
- tsk->portid);
- }
- return true;
-
- case SS_CONNECTING:
-
+ switch (sk->sk_state) {
+ case TIPC_CONNECTING:
/* Accept only ACK or NACK message */
if (unlikely(!msg_connected(hdr)))
return false;
if (unlikely(msg_errcode(hdr))) {
- sock->state = SS_DISCONNECTING;
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
sk->sk_err = ECONNREFUSED;
return true;
}
if (unlikely(!msg_isdata(hdr))) {
- sock->state = SS_DISCONNECTING;
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
sk->sk_err = EINVAL;
return true;
}
tipc_sk_finish_conn(tsk, msg_origport(hdr), msg_orignode(hdr));
msg_set_importance(&tsk->phdr, msg_importance(hdr));
- sock->state = SS_CONNECTED;
/* If 'ACK+' message, add to socket receive queue */
if (msg_data_sz(hdr))
@@ -1584,18 +1588,31 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
msg_set_dest_droppable(hdr, 1);
return false;
- case SS_LISTENING:
- case SS_UNCONNECTED:
-
+ case TIPC_OPEN:
+ case TIPC_DISCONNECTING:
+ break;
+ case TIPC_LISTEN:
/* Accept only SYN message */
if (!msg_connected(hdr) && !(msg_errcode(hdr)))
return true;
break;
- case SS_DISCONNECTING:
- break;
+ case TIPC_ESTABLISHED:
+ /* Accept only connection-based messages sent by peer */
+ if (unlikely(!tsk_peer_msg(tsk, hdr)))
+ return false;
+
+ if (unlikely(msg_errcode(hdr))) {
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
+ /* Let timer expire on it's own */
+ tipc_node_remove_conn(net, tsk_peer_node(tsk),
+ tsk->portid);
+ sk->sk_state_change(sk);
+ }
+ return true;
default:
- pr_err("Unknown socket state %u\n", sock->state);
+ pr_err("Unknown sk_state %u\n", sk->sk_state);
}
+
return false;
}
@@ -1646,12 +1663,12 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb)
static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
struct sk_buff_head *xmitq)
{
- struct socket *sock = sk->sk_socket;
struct tipc_sock *tsk = tipc_sk(sk);
struct tipc_msg *hdr = buf_msg(skb);
unsigned int limit = rcvbuf_limit(sk, skb);
int err = TIPC_OK;
int usr = msg_user(hdr);
+ u32 onode;
if (unlikely(msg_user(hdr) == CONN_MANAGER)) {
tipc_sk_proto_rcv(tsk, skb, xmitq);
@@ -1659,8 +1676,10 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
}
if (unlikely(usr == SOCK_WAKEUP)) {
+ onode = msg_orignode(hdr);
kfree_skb(skb);
- tsk->link_cong = 0;
+ u32_del(&tsk->cong_links, onode);
+ tsk->cong_link_cnt--;
sk->sk_write_space(sk);
return false;
}
@@ -1672,7 +1691,7 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
}
/* Reject if wrong message type for current socket state */
- if (unlikely(sock->state == SS_READY)) {
+ if (tipc_sk_type_connectionless(sk)) {
if (msg_connected(hdr)) {
err = TIPC_ERR_NO_PORT;
goto reject;
@@ -1689,7 +1708,7 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
}
/* Enqueue message */
- TIPC_SKB_CB(skb)->handle = NULL;
+ TIPC_SKB_CB(skb)->bytes_read = 0;
__skb_queue_tail(&sk->sk_receive_queue, skb);
skb_set_owner_r(skb, sk);
@@ -1839,8 +1858,8 @@ xmit:
static int tipc_wait_for_connect(struct socket *sock, long *timeo_p)
{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct sock *sk = sock->sk;
- DEFINE_WAIT(wait);
int done;
do {
@@ -1852,9 +1871,10 @@ static int tipc_wait_for_connect(struct socket *sock, long *timeo_p)
if (signal_pending(current))
return sock_intr_errno(*timeo_p);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- done = sk_wait_event(sk, timeo_p, sock->state != SS_CONNECTING);
- finish_wait(sk_sleep(sk), &wait);
+ add_wait_queue(sk_sleep(sk), &wait);
+ done = sk_wait_event(sk, timeo_p,
+ sk->sk_state != TIPC_CONNECTING, &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
} while (!done);
return 0;
}
@@ -1876,21 +1896,19 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest,
struct sockaddr_tipc *dst = (struct sockaddr_tipc *)dest;
struct msghdr m = {NULL,};
long timeout = (flags & O_NONBLOCK) ? 0 : tsk->conn_timeout;
- socket_state previous;
+ int previous;
int res = 0;
lock_sock(sk);
/* DGRAM/RDM connect(), just save the destaddr */
- if (sock->state == SS_READY) {
+ if (tipc_sk_type_connectionless(sk)) {
if (dst->family == AF_UNSPEC) {
- memset(&tsk->remote, 0, sizeof(struct sockaddr_tipc));
- tsk->connected = 0;
+ memset(&tsk->peer, 0, sizeof(struct sockaddr_tipc));
} else if (destlen != sizeof(struct sockaddr_tipc)) {
res = -EINVAL;
} else {
- memcpy(&tsk->remote, dest, destlen);
- tsk->connected = 1;
+ memcpy(&tsk->peer, dest, destlen);
}
goto exit;
}
@@ -1906,9 +1924,10 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest,
goto exit;
}
- previous = sock->state;
- switch (sock->state) {
- case SS_UNCONNECTED:
+ previous = sk->sk_state;
+
+ switch (sk->sk_state) {
+ case TIPC_OPEN:
/* Send a 'SYN-' to destination */
m.msg_name = dest;
m.msg_namelen = destlen;
@@ -1923,27 +1942,29 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest,
if ((res < 0) && (res != -EWOULDBLOCK))
goto exit;
- /* Just entered SS_CONNECTING state; the only
+ /* Just entered TIPC_CONNECTING state; the only
* difference is that return value in non-blocking
* case is EINPROGRESS, rather than EALREADY.
*/
res = -EINPROGRESS;
- case SS_CONNECTING:
- if (previous == SS_CONNECTING)
- res = -EALREADY;
- if (!timeout)
+ /* fall thru' */
+ case TIPC_CONNECTING:
+ if (!timeout) {
+ if (previous == TIPC_CONNECTING)
+ res = -EALREADY;
goto exit;
+ }
timeout = msecs_to_jiffies(timeout);
/* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */
res = tipc_wait_for_connect(sock, &timeout);
break;
- case SS_CONNECTED:
+ case TIPC_ESTABLISHED:
res = -EISCONN;
break;
default:
res = -EINVAL;
- break;
}
+
exit:
release_sock(sk);
return res;
@@ -1962,15 +1983,9 @@ static int tipc_listen(struct socket *sock, int len)
int res;
lock_sock(sk);
-
- if (sock->state != SS_UNCONNECTED)
- res = -EINVAL;
- else {
- sock->state = SS_LISTENING;
- res = 0;
- }
-
+ res = tipc_set_sk_state(sk, TIPC_LISTEN);
release_sock(sk);
+
return res;
}
@@ -1996,9 +2011,6 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo)
err = 0;
if (!skb_queue_empty(&sk->sk_receive_queue))
break;
- err = -EINVAL;
- if (sock->state != SS_LISTENING)
- break;
err = -EAGAIN;
if (!timeo)
break;
@@ -2018,7 +2030,8 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo)
*
* Returns 0 on success, errno otherwise
*/
-static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
+static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags,
+ bool kern)
{
struct sock *new_sk, *sk = sock->sk;
struct sk_buff *buf;
@@ -2029,7 +2042,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
lock_sock(sk);
- if (sock->state != SS_LISTENING) {
+ if (sk->sk_state != TIPC_LISTEN) {
res = -EINVAL;
goto exit;
}
@@ -2040,7 +2053,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
buf = skb_peek(&sk->sk_receive_queue);
- res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, 1);
+ res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, kern);
if (res)
goto exit;
security_sk_clone(sock->sk, new_sock->sk);
@@ -2060,7 +2073,6 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
/* Connect new socket to it's peer */
tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg));
- new_sock->state = SS_CONNECTED;
tsk_set_importance(new_tsock, msg_importance(msg));
if (msg_named(msg)) {
@@ -2076,7 +2088,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
struct msghdr m = {NULL,};
tsk_advance_rx_queue(sk);
- __tipc_send_stream(new_sock, &m, 0);
+ __tipc_sendstream(new_sock, &m, 0);
} else {
__skb_dequeue(&sk->sk_receive_queue);
__skb_queue_head(&new_sk->sk_receive_queue, buf);
@@ -2100,13 +2112,6 @@ exit:
static int tipc_shutdown(struct socket *sock, int how)
{
struct sock *sk = sock->sk;
- struct net *net = sock_net(sk);
- struct tipc_sock *tsk = tipc_sk(sk);
- struct sk_buff *skb;
- u32 dnode = tsk_peer_node(tsk);
- u32 dport = tsk_peer_port(tsk);
- u32 onode = tipc_own_addr(net);
- u32 oport = tsk->portid;
int res;
if (how != SHUT_RDWR)
@@ -2114,45 +2119,17 @@ static int tipc_shutdown(struct socket *sock, int how)
lock_sock(sk);
- switch (sock->state) {
- case SS_CONNECTING:
- case SS_CONNECTED:
-
-restart:
- dnode = tsk_peer_node(tsk);
-
- /* Disconnect and send a 'FIN+' or 'FIN-' message to peer */
- skb = __skb_dequeue(&sk->sk_receive_queue);
- if (skb) {
- if (TIPC_SKB_CB(skb)->handle != NULL) {
- kfree_skb(skb);
- goto restart;
- }
- tipc_sk_respond(sk, skb, TIPC_CONN_SHUTDOWN);
- } else {
- skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
- TIPC_CONN_MSG, SHORT_H_SIZE,
- 0, dnode, onode, dport, oport,
- TIPC_CONN_SHUTDOWN);
- if (skb)
- tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
- }
- tsk->connected = 0;
- sock->state = SS_DISCONNECTING;
- tipc_node_remove_conn(net, dnode, tsk->portid);
- /* fall through */
-
- case SS_DISCONNECTING:
+ __tipc_shutdown(sock, TIPC_CONN_SHUTDOWN);
+ sk->sk_shutdown = SEND_SHUTDOWN;
+ if (sk->sk_state == TIPC_DISCONNECTING) {
/* Discard any unreceived messages */
__skb_queue_purge(&sk->sk_receive_queue);
/* Wake up anyone sleeping in poll */
sk->sk_state_change(sk);
res = 0;
- break;
-
- default:
+ } else {
res = -ENOTCONN;
}
@@ -2169,17 +2146,16 @@ static void tipc_sk_timeout(unsigned long data)
u32 own_node = tsk_own_node(tsk);
bh_lock_sock(sk);
- if (!tsk->connected) {
+ if (!tipc_sk_connected(sk)) {
bh_unlock_sock(sk);
goto exit;
}
peer_port = tsk_peer_port(tsk);
peer_node = tsk_peer_node(tsk);
- if (tsk->probing_state == TIPC_CONN_PROBING) {
+ if (tsk->probe_unacked) {
if (!sock_owned_by_user(sk)) {
- sk->sk_socket->state = SS_DISCONNECTING;
- tsk->connected = 0;
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk),
tsk_peer_port(tsk));
sk->sk_state_change(sk);
@@ -2188,13 +2164,15 @@ static void tipc_sk_timeout(unsigned long data)
sk_reset_timer(sk, &sk->sk_timer, (HZ / 20));
}
- } else {
- skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE,
- INT_H_SIZE, 0, peer_node, own_node,
- peer_port, tsk->portid, TIPC_OK);
- tsk->probing_state = TIPC_CONN_PROBING;
- sk_reset_timer(sk, &sk->sk_timer, jiffies + tsk->probing_intv);
+ bh_unlock_sock(sk);
+ goto exit;
}
+
+ skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE,
+ INT_H_SIZE, 0, peer_node, own_node,
+ peer_port, tsk->portid, TIPC_OK);
+ tsk->probe_unacked = true;
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTERVAL);
bh_unlock_sock(sk);
if (skb)
tipc_node_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid);
@@ -2205,11 +2183,12 @@ exit:
static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
struct tipc_name_seq const *seq)
{
- struct net *net = sock_net(&tsk->sk);
+ struct sock *sk = &tsk->sk;
+ struct net *net = sock_net(sk);
struct publication *publ;
u32 key;
- if (tsk->connected)
+ if (tipc_sk_connected(sk))
return -EINVAL;
key = tsk->portid + tsk->pub_count + 1;
if (key == tsk->portid)
@@ -2264,24 +2243,27 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
void tipc_sk_reinit(struct net *net)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
- const struct bucket_table *tbl;
- struct rhash_head *pos;
+ struct rhashtable_iter iter;
struct tipc_sock *tsk;
struct tipc_msg *msg;
- int i;
- rcu_read_lock();
- tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht);
- for (i = 0; i < tbl->size; i++) {
- rht_for_each_entry_rcu(tsk, pos, tbl, i, node) {
+ rhashtable_walk_enter(&tn->sk_rht, &iter);
+
+ do {
+ tsk = ERR_PTR(rhashtable_walk_start(&iter));
+ if (tsk)
+ continue;
+
+ while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) {
spin_lock_bh(&tsk->sk.sk_lock.slock);
msg = &tsk->phdr;
msg_set_prevnode(msg, tn->own_addr);
msg_set_orignode(msg, tn->own_addr);
spin_unlock_bh(&tsk->sk.sk_lock.slock);
}
- }
- rcu_read_unlock();
+
+ rhashtable_walk_stop(&iter);
+ } while (tsk == ERR_PTR(-EAGAIN));
}
static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid)
@@ -2377,18 +2359,29 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
{
struct sock *sk = sock->sk;
struct tipc_sock *tsk = tipc_sk(sk);
- u32 value;
- int res;
+ u32 value = 0;
+ int res = 0;
if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM))
return 0;
if (lvl != SOL_TIPC)
return -ENOPROTOOPT;
- if (ol < sizeof(value))
- return -EINVAL;
- res = get_user(value, (u32 __user *)ov);
- if (res)
- return res;
+
+ switch (opt) {
+ case TIPC_IMPORTANCE:
+ case TIPC_SRC_DROPPABLE:
+ case TIPC_DEST_DROPPABLE:
+ case TIPC_CONN_TIMEOUT:
+ if (ol < sizeof(value))
+ return -EINVAL;
+ res = get_user(value, (u32 __user *)ov);
+ if (res)
+ return res;
+ break;
+ default:
+ if (ov || ol)
+ return -EINVAL;
+ }
lock_sock(sk);
@@ -2407,7 +2400,14 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
break;
case TIPC_CONN_TIMEOUT:
tipc_sk(sk)->conn_timeout = value;
- /* no need to set "res", since already 0 at this point */
+ break;
+ case TIPC_MCAST_BROADCAST:
+ tsk->mc_method.rcast = false;
+ tsk->mc_method.mandatory = true;
+ break;
+ case TIPC_MCAST_REPLICAST:
+ tsk->mc_method.rcast = true;
+ tsk->mc_method.mandatory = true;
break;
default:
res = -EINVAL;
@@ -2570,7 +2570,7 @@ static const struct proto_ops stream_ops = {
.shutdown = tipc_shutdown,
.setsockopt = tipc_setsockopt,
.getsockopt = tipc_getsockopt,
- .sendmsg = tipc_send_stream,
+ .sendmsg = tipc_sendstream,
.recvmsg = tipc_recv_stream,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage
@@ -2667,6 +2667,7 @@ static int __tipc_nl_add_sk(struct sk_buff *skb, struct netlink_callback *cb,
struct nlattr *attrs;
struct net *net = sock_net(skb->sk);
struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct sock *sk = &tsk->sk;
hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
&tipc_genl_family, NLM_F_MULTI, TIPC_NL_SOCK_GET);
@@ -2681,7 +2682,7 @@ static int __tipc_nl_add_sk(struct sk_buff *skb, struct netlink_callback *cb,
if (nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tn->own_addr))
goto attr_msg_cancel;
- if (tsk->connected) {
+ if (tipc_sk_connected(sk)) {
err = __tipc_nl_add_sk_con(skb, tsk);
if (err)
goto attr_msg_cancel;
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 0dd02244e21d..271cd66e4b3b 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -54,6 +54,8 @@ struct tipc_subscriber {
static void tipc_subscrp_delete(struct tipc_subscription *sub);
static void tipc_subscrb_put(struct tipc_subscriber *subscriber);
+static void tipc_subscrp_put(struct tipc_subscription *subscription);
+static void tipc_subscrp_get(struct tipc_subscription *subscription);
/**
* htohl - convert value to endianness used by destination
@@ -123,6 +125,7 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower,
{
struct tipc_name_seq seq;
+ tipc_subscrp_get(sub);
tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq);
if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper))
return;
@@ -132,6 +135,7 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower,
tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref,
node);
+ tipc_subscrp_put(sub);
}
static void tipc_subscrp_timeout(unsigned long data)
@@ -139,23 +143,20 @@ static void tipc_subscrp_timeout(unsigned long data)
struct tipc_subscription *sub = (struct tipc_subscription *)data;
struct tipc_subscriber *subscriber = sub->subscriber;
+ spin_lock_bh(&subscriber->lock);
+ tipc_nametbl_unsubscribe(sub);
+ spin_unlock_bh(&subscriber->lock);
+
/* Notify subscriber of timeout */
tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper,
TIPC_SUBSCR_TIMEOUT, 0, 0);
- spin_lock_bh(&subscriber->lock);
- tipc_subscrp_delete(sub);
- spin_unlock_bh(&subscriber->lock);
-
- tipc_subscrb_put(subscriber);
+ tipc_subscrp_put(sub);
}
static void tipc_subscrb_kref_release(struct kref *kref)
{
- struct tipc_subscriber *subcriber = container_of(kref,
- struct tipc_subscriber, kref);
-
- kfree(subcriber);
+ kfree(container_of(kref,struct tipc_subscriber, kref));
}
static void tipc_subscrb_put(struct tipc_subscriber *subscriber)
@@ -168,6 +169,59 @@ static void tipc_subscrb_get(struct tipc_subscriber *subscriber)
kref_get(&subscriber->kref);
}
+static void tipc_subscrp_kref_release(struct kref *kref)
+{
+ struct tipc_subscription *sub = container_of(kref,
+ struct tipc_subscription,
+ kref);
+ struct tipc_net *tn = net_generic(sub->net, tipc_net_id);
+ struct tipc_subscriber *subscriber = sub->subscriber;
+
+ spin_lock_bh(&subscriber->lock);
+ list_del(&sub->subscrp_list);
+ atomic_dec(&tn->subscription_count);
+ spin_unlock_bh(&subscriber->lock);
+ kfree(sub);
+ tipc_subscrb_put(subscriber);
+}
+
+static void tipc_subscrp_put(struct tipc_subscription *subscription)
+{
+ kref_put(&subscription->kref, tipc_subscrp_kref_release);
+}
+
+static void tipc_subscrp_get(struct tipc_subscription *subscription)
+{
+ kref_get(&subscription->kref);
+}
+
+/* tipc_subscrb_subscrp_delete - delete a specific subscription or all
+ * subscriptions for a given subscriber.
+ */
+static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber,
+ struct tipc_subscr *s)
+{
+ struct list_head *subscription_list = &subscriber->subscrp_list;
+ struct tipc_subscription *sub, *temp;
+
+ spin_lock_bh(&subscriber->lock);
+ list_for_each_entry_safe(sub, temp, subscription_list, subscrp_list) {
+ if (s && memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr)))
+ continue;
+
+ tipc_nametbl_unsubscribe(sub);
+ tipc_subscrp_get(sub);
+ spin_unlock_bh(&subscriber->lock);
+ tipc_subscrp_delete(sub);
+ tipc_subscrp_put(sub);
+ spin_lock_bh(&subscriber->lock);
+
+ if (s)
+ break;
+ }
+ spin_unlock_bh(&subscriber->lock);
+}
+
static struct tipc_subscriber *tipc_subscrb_create(int conid)
{
struct tipc_subscriber *subscriber;
@@ -177,8 +231,8 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid)
pr_warn("Subscriber rejected, no memory\n");
return NULL;
}
- kref_init(&subscriber->kref);
INIT_LIST_HEAD(&subscriber->subscrp_list);
+ kref_init(&subscriber->kref);
subscriber->conid = conid;
spin_lock_init(&subscriber->lock);
@@ -187,55 +241,22 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid)
static void tipc_subscrb_delete(struct tipc_subscriber *subscriber)
{
- struct tipc_subscription *sub, *temp;
- u32 timeout;
-
- spin_lock_bh(&subscriber->lock);
- /* Destroy any existing subscriptions for subscriber */
- list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list,
- subscrp_list) {
- timeout = htohl(sub->evt.s.timeout, sub->swap);
- if ((timeout == TIPC_WAIT_FOREVER) || del_timer(&sub->timer)) {
- tipc_subscrp_delete(sub);
- tipc_subscrb_put(subscriber);
- }
- }
- spin_unlock_bh(&subscriber->lock);
-
+ tipc_subscrb_subscrp_delete(subscriber, NULL);
tipc_subscrb_put(subscriber);
}
static void tipc_subscrp_delete(struct tipc_subscription *sub)
{
- struct tipc_net *tn = net_generic(sub->net, tipc_net_id);
+ u32 timeout = htohl(sub->evt.s.timeout, sub->swap);
- tipc_nametbl_unsubscribe(sub);
- list_del(&sub->subscrp_list);
- kfree(sub);
- atomic_dec(&tn->subscription_count);
+ if (timeout == TIPC_WAIT_FOREVER || del_timer(&sub->timer))
+ tipc_subscrp_put(sub);
}
static void tipc_subscrp_cancel(struct tipc_subscr *s,
struct tipc_subscriber *subscriber)
{
- struct tipc_subscription *sub, *temp;
- u32 timeout;
-
- spin_lock_bh(&subscriber->lock);
- /* Find first matching subscription, exit if not found */
- list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list,
- subscrp_list) {
- if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) {
- timeout = htohl(sub->evt.s.timeout, sub->swap);
- if ((timeout == TIPC_WAIT_FOREVER) ||
- del_timer(&sub->timer)) {
- tipc_subscrp_delete(sub);
- tipc_subscrb_put(subscriber);
- }
- break;
- }
- }
- spin_unlock_bh(&subscriber->lock);
+ tipc_subscrb_subscrp_delete(subscriber, s);
}
static struct tipc_subscription *tipc_subscrp_create(struct net *net,
@@ -272,6 +293,7 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net,
sub->swap = swap;
memcpy(&sub->evt.s, s, sizeof(*s));
atomic_inc(&tn->subscription_count);
+ kref_init(&sub->kref);
return sub;
}
@@ -288,17 +310,16 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
spin_lock_bh(&subscriber->lock);
list_add(&sub->subscrp_list, &subscriber->subscrp_list);
- tipc_subscrb_get(subscriber);
sub->subscriber = subscriber;
tipc_nametbl_subscribe(sub);
+ tipc_subscrb_get(subscriber);
spin_unlock_bh(&subscriber->lock);
+ setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub);
timeout = htohl(sub->evt.s.timeout, swap);
- if (timeout == TIPC_WAIT_FOREVER)
- return;
- setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub);
- mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout));
+ if (timeout != TIPC_WAIT_FOREVER)
+ mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout));
}
/* Handle one termination request for the subscriber */
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index be60103082c9..ffdc214c117a 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -57,6 +57,7 @@ struct tipc_subscriber;
* @evt: template for events generated by subscription
*/
struct tipc_subscription {
+ struct kref kref;
struct tipc_subscriber *subscriber;
struct net *net;
struct timer_list timer;
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index b58dc95f3d35..46061cf48cd1 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -113,7 +113,7 @@ static void tipc_udp_media_addr_set(struct tipc_media_addr *addr,
memcpy(addr->value, ua, sizeof(struct udp_media_addr));
if (tipc_udp_is_mcast_addr(ua))
- addr->broadcast = 1;
+ addr->broadcast = TIPC_BROADCAST_SUPPORT;
}
/* tipc_udp_addr2str - convert ip/udp address to string */
@@ -229,7 +229,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
goto out;
}
- if (!addr->broadcast || list_empty(&ub->rcast.list))
+ if (addr->broadcast != TIPC_REPLICAST_SUPPORT)
return tipc_udp_xmit(net, skb, ub, src, dst);
/* Replicast, send an skb to each configured IP address */
@@ -296,7 +296,7 @@ static int tipc_udp_rcast_add(struct tipc_bearer *b,
else if (ntohs(addr->proto) == ETH_P_IPV6)
pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6);
#endif
-
+ b->bcast_addr.broadcast = TIPC_REPLICAST_SUPPORT;
list_add_rcu(&rcast->list, &ub->rcast.list);
return 0;
}
@@ -681,7 +681,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
goto err;
b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP;
- b->bcast_addr.broadcast = 1;
+ b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT;
rcu_assign_pointer(b->media_ptr, ub);
rcu_assign_pointer(ub->bearer, b);
tipc_udp_media_addr_set(&b->addr, &local);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 2358f2690ec5..928691c43408 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -85,7 +85,7 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
@@ -100,7 +100,7 @@
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
@@ -117,6 +117,7 @@
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/freezer.h>
+#include <linux/file.h>
struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
@@ -315,7 +316,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
&unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
struct dentry *dentry = unix_sk(s)->path.dentry;
- if (dentry && d_real_inode(dentry) == i) {
+ if (dentry && d_backing_inode(dentry) == i) {
sock_hold(s);
goto found;
}
@@ -635,7 +636,7 @@ static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
-static int unix_accept(struct socket *, struct socket *, int);
+static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static unsigned int unix_dgram_poll(struct file *, struct socket *,
@@ -913,7 +914,7 @@ static struct sock *unix_find_other(struct net *net,
err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
if (err)
goto fail;
- inode = d_real_inode(path.dentry);
+ inode = d_backing_inode(path.dentry);
err = inode_permission(inode, MAY_WRITE);
if (err)
goto put_fail;
@@ -995,6 +996,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
unsigned int hash;
struct unix_address *addr;
struct hlist_head *list;
+ struct path path = { NULL, NULL };
err = -EINVAL;
if (sunaddr->sun_family != AF_UNIX)
@@ -1010,9 +1012,20 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out;
addr_len = err;
+ if (sun_path[0]) {
+ umode_t mode = S_IFSOCK |
+ (SOCK_INODE(sock)->i_mode & ~current_umask());
+ err = unix_mknod(sun_path, mode, &path);
+ if (err) {
+ if (err == -EEXIST)
+ err = -EADDRINUSE;
+ goto out;
+ }
+ }
+
err = mutex_lock_interruptible(&u->bindlock);
if (err)
- goto out;
+ goto out_put;
err = -EINVAL;
if (u->addr)
@@ -1029,18 +1042,8 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
atomic_set(&addr->refcnt, 1);
if (sun_path[0]) {
- struct path path;
- umode_t mode = S_IFSOCK |
- (SOCK_INODE(sock)->i_mode & ~current_umask());
- err = unix_mknod(sun_path, mode, &path);
- if (err) {
- if (err == -EEXIST)
- err = -EADDRINUSE;
- unix_release_addr(addr);
- goto out_up;
- }
addr->hash = UNIX_HASH_SIZE;
- hash = d_real_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
+ hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
spin_lock(&unix_table_lock);
u->path = path;
list = &unix_socket_table[hash];
@@ -1065,6 +1068,9 @@ out_unlock:
spin_unlock(&unix_table_lock);
out_up:
mutex_unlock(&u->bindlock);
+out_put:
+ if (err)
+ path_put(&path);
out:
return err;
}
@@ -1396,7 +1402,8 @@ static void unix_sock_inherit_flags(const struct socket *old,
set_bit(SOCK_PASSSEC, &new->flags);
}
-static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
+static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern)
{
struct sock *sk = sock->sk;
struct sock *tsk;
@@ -2113,8 +2120,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
mutex_lock(&u->iolock);
skip = sk_peek_offset(sk, flags);
- skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err,
- &last);
+ skb = __skb_try_recv_datagram(sk, flags, NULL, &peeked, &skip,
+ &err, &last);
if (skb)
break;
@@ -2587,6 +2594,43 @@ long unix_outq_len(struct sock *sk)
}
EXPORT_SYMBOL_GPL(unix_outq_len);
+static int unix_open_file(struct sock *sk)
+{
+ struct path path;
+ struct file *f;
+ int fd;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ unix_state_lock(sk);
+ path = unix_sk(sk)->path;
+ if (!path.dentry) {
+ unix_state_unlock(sk);
+ return -ENOENT;
+ }
+
+ path_get(&path);
+ unix_state_unlock(sk);
+
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0)
+ goto out;
+
+ f = dentry_open(&path, O_PATH, current_cred());
+ if (IS_ERR(f)) {
+ put_unused_fd(fd);
+ fd = PTR_ERR(f);
+ goto out;
+ }
+
+ fd_install(fd, f);
+out:
+ path_put(&path);
+
+ return fd;
+}
+
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
struct sock *sk = sock->sk;
@@ -2605,6 +2649,9 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
else
err = put_user(amount, (int __user *)arg);
break;
+ case SIOCUNIXFILE:
+ err = unix_open_file(sk);
+ break;
default:
err = -ENOIOCTLCMD;
break;
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 6a0d48525fcf..c36757e72844 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -146,6 +146,7 @@ void unix_notinflight(struct user_struct *user, struct file *fp)
if (s) {
struct unix_sock *u = unix_sk(s);
+ BUG_ON(!atomic_long_read(&u->inflight));
BUG_ON(list_empty(&u->link));
if (atomic_long_dec_and_test(&u->inflight))
@@ -341,6 +342,14 @@ void unix_gc(void)
}
list_del(&cursor);
+ /* Now gc_candidates contains only garbage. Restore original
+ * inflight counters for these as well, and remove the skbuffs
+ * which are creating the cycle(s).
+ */
+ skb_queue_head_init(&hitlist);
+ list_for_each_entry(u, &gc_candidates, link)
+ scan_children(&u->sk, inc_inflight, &hitlist);
+
/* not_cycle_list contains those sockets which do not make up a
* cycle. Restore these to the inflight list.
*/
@@ -350,14 +359,6 @@ void unix_gc(void)
list_move_tail(&u->link, &gc_inflight_list);
}
- /* Now gc_candidates contains only garbage. Restore original
- * inflight counters for these as well, and remove the skbuffs
- * which are creating the cycle(s).
- */
- skb_queue_head_init(&hitlist);
- list_for_each_entry(u, &gc_candidates, link)
- scan_children(&u->sk, inc_inflight, &hitlist);
-
spin_unlock(&unix_gc_lock);
/* Here we are. Hitlist is filled. Die. */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 8a398b3fb532..6f7f6757ceef 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -90,6 +90,7 @@
#include <linux/init.h>
#include <linux/io.h>
#include <linux/kernel.h>
+#include <linux/sched/signal.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
@@ -1101,10 +1102,19 @@ static const struct proto_ops vsock_dgram_ops = {
.sendpage = sock_no_sendpage,
};
+static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
+{
+ if (!transport->cancel_pkt)
+ return -EOPNOTSUPP;
+
+ return transport->cancel_pkt(vsk);
+}
+
static void vsock_connect_timeout(struct work_struct *work)
{
struct sock *sk;
struct vsock_sock *vsk;
+ int cancel = 0;
vsk = container_of(work, struct vsock_sock, dwork.work);
sk = sk_vsock(vsk);
@@ -1115,8 +1125,11 @@ static void vsock_connect_timeout(struct work_struct *work)
sk->sk_state = SS_UNCONNECTED;
sk->sk_err = ETIMEDOUT;
sk->sk_error_report(sk);
+ cancel = 1;
}
release_sock(sk);
+ if (cancel)
+ vsock_transport_cancel_pkt(vsk);
sock_put(sk);
}
@@ -1223,11 +1236,13 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
err = sock_intr_errno(timeout);
sk->sk_state = SS_UNCONNECTED;
sock->state = SS_UNCONNECTED;
+ vsock_transport_cancel_pkt(vsk);
goto out_wait;
} else if (timeout == 0) {
err = -ETIMEDOUT;
sk->sk_state = SS_UNCONNECTED;
sock->state = SS_UNCONNECTED;
+ vsock_transport_cancel_pkt(vsk);
goto out_wait;
}
@@ -1249,7 +1264,8 @@ out:
return err;
}
-static int vsock_accept(struct socket *sock, struct socket *newsock, int flags)
+static int vsock_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern)
{
struct sock *listener;
int err;
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 936d7eee62d0..68675a151f22 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -44,6 +44,10 @@ struct virtio_vsock {
spinlock_t send_pkt_list_lock;
struct list_head send_pkt_list;
+ struct work_struct loopback_work;
+ spinlock_t loopback_list_lock; /* protects loopback_list */
+ struct list_head loopback_list;
+
atomic_t queued_replies;
/* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX]
@@ -74,6 +78,42 @@ static u32 virtio_transport_get_local_cid(void)
return vsock->guest_cid;
}
+static void virtio_transport_loopback_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, loopback_work);
+ LIST_HEAD(pkts);
+
+ spin_lock_bh(&vsock->loopback_list_lock);
+ list_splice_init(&vsock->loopback_list, &pkts);
+ spin_unlock_bh(&vsock->loopback_list_lock);
+
+ mutex_lock(&vsock->rx_lock);
+ while (!list_empty(&pkts)) {
+ struct virtio_vsock_pkt *pkt;
+
+ pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list);
+ list_del_init(&pkt->list);
+
+ virtio_transport_recv_pkt(pkt);
+ }
+ mutex_unlock(&vsock->rx_lock);
+}
+
+static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
+ struct virtio_vsock_pkt *pkt)
+{
+ int len = pkt->len;
+
+ spin_lock_bh(&vsock->loopback_list_lock);
+ list_add_tail(&pkt->list, &vsock->loopback_list);
+ spin_unlock_bh(&vsock->loopback_list_lock);
+
+ queue_work(virtio_vsock_workqueue, &vsock->loopback_work);
+
+ return len;
+}
+
static void
virtio_transport_send_pkt_work(struct work_struct *work)
{
@@ -159,6 +199,9 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
return -ENODEV;
}
+ if (le32_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid)
+ return virtio_transport_send_pkt_loopback(vsock, pkt);
+
if (pkt->reply)
atomic_inc(&vsock->queued_replies);
@@ -170,6 +213,47 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
return len;
}
+static int
+virtio_transport_cancel_pkt(struct vsock_sock *vsk)
+{
+ struct virtio_vsock *vsock;
+ struct virtio_vsock_pkt *pkt, *n;
+ int cnt = 0;
+ LIST_HEAD(freeme);
+
+ vsock = virtio_vsock_get();
+ if (!vsock) {
+ return -ENODEV;
+ }
+
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) {
+ if (pkt->vsk != vsk)
+ continue;
+ list_move(&pkt->list, &freeme);
+ }
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ list_for_each_entry_safe(pkt, n, &freeme, list) {
+ if (pkt->reply)
+ cnt++;
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+
+ if (cnt) {
+ struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX];
+ int new_cnt;
+
+ new_cnt = atomic_sub_return(cnt, &vsock->queued_replies);
+ if (new_cnt + cnt >= virtqueue_get_vring_size(rx_vq) &&
+ new_cnt < virtqueue_get_vring_size(rx_vq))
+ queue_work(virtio_vsock_workqueue, &vsock->rx_work);
+ }
+
+ return 0;
+}
+
static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
{
int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
@@ -336,7 +420,7 @@ static void virtio_vsock_reset_sock(struct sock *sk)
static void virtio_vsock_update_guest_cid(struct virtio_vsock *vsock)
{
struct virtio_device *vdev = vsock->vdev;
- u64 guest_cid;
+ __le64 guest_cid;
vdev->config->get(vdev, offsetof(struct virtio_vsock_config, guest_cid),
&guest_cid, sizeof(guest_cid));
@@ -419,6 +503,7 @@ static struct virtio_transport virtio_transport = {
.release = virtio_transport_release,
.connect = virtio_transport_connect,
.shutdown = virtio_transport_shutdown,
+ .cancel_pkt = virtio_transport_cancel_pkt,
.dgram_bind = virtio_transport_dgram_bind,
.dgram_dequeue = virtio_transport_dgram_dequeue,
@@ -489,7 +574,8 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
vsock->vdev = vdev;
ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX,
- vsock->vqs, callbacks, names);
+ vsock->vqs, callbacks, names,
+ NULL);
if (ret < 0)
goto out;
@@ -510,10 +596,13 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
mutex_init(&vsock->event_lock);
spin_lock_init(&vsock->send_pkt_list_lock);
INIT_LIST_HEAD(&vsock->send_pkt_list);
+ spin_lock_init(&vsock->loopback_list_lock);
+ INIT_LIST_HEAD(&vsock->loopback_list);
INIT_WORK(&vsock->rx_work, virtio_transport_rx_work);
INIT_WORK(&vsock->tx_work, virtio_transport_tx_work);
INIT_WORK(&vsock->event_work, virtio_transport_event_work);
INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work);
+ INIT_WORK(&vsock->loopback_work, virtio_transport_loopback_work);
mutex_lock(&vsock->rx_lock);
virtio_vsock_rx_fill(vsock);
@@ -539,6 +628,7 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
struct virtio_vsock *vsock = vdev->priv;
struct virtio_vsock_pkt *pkt;
+ flush_work(&vsock->loopback_work);
flush_work(&vsock->rx_work);
flush_work(&vsock->tx_work);
flush_work(&vsock->event_work);
@@ -565,6 +655,15 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
}
spin_unlock_bh(&vsock->send_pkt_list_lock);
+ spin_lock_bh(&vsock->loopback_list_lock);
+ while (!list_empty(&vsock->loopback_list)) {
+ pkt = list_first_entry(&vsock->loopback_list,
+ struct virtio_vsock_pkt, list);
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+ spin_unlock_bh(&vsock->loopback_list_lock);
+
mutex_lock(&the_virtio_vsock_mutex);
the_virtio_vsock = NULL;
vsock_core_exit();
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index a53b3a16b4f1..af087b44ceea 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -9,6 +9,7 @@
*/
#include <linux/spinlock.h>
#include <linux/module.h>
+#include <linux/sched/signal.h>
#include <linux/ctype.h>
#include <linux/list.h>
#include <linux/virtio.h>
@@ -32,7 +33,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
return container_of(t, struct virtio_transport, transport);
}
-struct virtio_vsock_pkt *
+static struct virtio_vsock_pkt *
virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
size_t len,
u32 src_cid,
@@ -57,6 +58,7 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
pkt->len = len;
pkt->hdr.len = cpu_to_le32(len);
pkt->reply = info->reply;
+ pkt->vsk = info->vsk;
if (info->msg && len > 0) {
pkt->buf = kmalloc(len, GFP_KERNEL);
@@ -82,7 +84,6 @@ out_pkt:
kfree(pkt);
return NULL;
}
-EXPORT_SYMBOL_GPL(virtio_transport_alloc_pkt);
static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
struct virtio_vsock_pkt_info *info)
@@ -180,6 +181,7 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_CREDIT_UPDATE,
.type = type,
+ .vsk = vsk,
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -519,6 +521,7 @@ int virtio_transport_connect(struct vsock_sock *vsk)
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_REQUEST,
.type = VIRTIO_VSOCK_TYPE_STREAM,
+ .vsk = vsk,
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -534,6 +537,7 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode)
VIRTIO_VSOCK_SHUTDOWN_RCV : 0) |
(mode & SEND_SHUTDOWN ?
VIRTIO_VSOCK_SHUTDOWN_SEND : 0),
+ .vsk = vsk,
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -560,6 +564,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
.type = VIRTIO_VSOCK_TYPE_STREAM,
.msg = msg,
.pkt_len = len,
+ .vsk = vsk,
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -581,6 +586,7 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
.op = VIRTIO_VSOCK_OP_RST,
.type = VIRTIO_VSOCK_TYPE_STREAM,
.reply = !!pkt,
+ .vsk = vsk,
};
/* Send RST only if the original pkt is not a RST pkt */
@@ -606,9 +612,9 @@ static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
return 0;
pkt = virtio_transport_alloc_pkt(&info, 0,
- le32_to_cpu(pkt->hdr.dst_cid),
+ le64_to_cpu(pkt->hdr.dst_cid),
le32_to_cpu(pkt->hdr.dst_port),
- le32_to_cpu(pkt->hdr.src_cid),
+ le64_to_cpu(pkt->hdr.src_cid),
le32_to_cpu(pkt->hdr.src_port));
if (!pkt)
return -ENOMEM;
@@ -619,17 +625,17 @@ static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
static void virtio_transport_wait_close(struct sock *sk, long timeout)
{
if (timeout) {
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ add_wait_queue(sk_sleep(sk), &wait);
do {
- prepare_to_wait(sk_sleep(sk), &wait,
- TASK_INTERRUPTIBLE);
if (sk_wait_event(sk, &timeout,
- sock_flag(sk, SOCK_DONE)))
+ sock_flag(sk, SOCK_DONE), &wait))
break;
} while (!signal_pending(current) && timeout);
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
}
}
@@ -823,9 +829,10 @@ virtio_transport_send_response(struct vsock_sock *vsk,
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_RESPONSE,
.type = VIRTIO_VSOCK_TYPE_STREAM,
- .remote_cid = le32_to_cpu(pkt->hdr.src_cid),
+ .remote_cid = le64_to_cpu(pkt->hdr.src_cid),
.remote_port = le32_to_cpu(pkt->hdr.src_port),
.reply = true,
+ .vsk = vsk,
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -863,9 +870,9 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
child->sk_state = SS_CONNECTED;
vchild = vsock_sk(child);
- vsock_addr_init(&vchild->local_addr, le32_to_cpu(pkt->hdr.dst_cid),
+ vsock_addr_init(&vchild->local_addr, le64_to_cpu(pkt->hdr.dst_cid),
le32_to_cpu(pkt->hdr.dst_port));
- vsock_addr_init(&vchild->remote_addr, le32_to_cpu(pkt->hdr.src_cid),
+ vsock_addr_init(&vchild->remote_addr, le64_to_cpu(pkt->hdr.src_cid),
le32_to_cpu(pkt->hdr.src_port));
vsock_insert_connected(vchild);
@@ -904,9 +911,9 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
struct sock *sk;
bool space_available;
- vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid),
+ vsock_addr_init(&src, le64_to_cpu(pkt->hdr.src_cid),
le32_to_cpu(pkt->hdr.src_port));
- vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid),
+ vsock_addr_init(&dst, le64_to_cpu(pkt->hdr.dst_cid),
le32_to_cpu(pkt->hdr.dst_port));
trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port,
diff --git a/net/vmw_vsock/vmci_transport_notify.c b/net/vmw_vsock/vmci_transport_notify.c
index fd8cf0214d51..1406db4d97d1 100644
--- a/net/vmw_vsock/vmci_transport_notify.c
+++ b/net/vmw_vsock/vmci_transport_notify.c
@@ -662,19 +662,19 @@ static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
/* Socket control packet based operations. */
const struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = {
- vmci_transport_notify_pkt_socket_init,
- vmci_transport_notify_pkt_socket_destruct,
- vmci_transport_notify_pkt_poll_in,
- vmci_transport_notify_pkt_poll_out,
- vmci_transport_notify_pkt_handle_pkt,
- vmci_transport_notify_pkt_recv_init,
- vmci_transport_notify_pkt_recv_pre_block,
- vmci_transport_notify_pkt_recv_pre_dequeue,
- vmci_transport_notify_pkt_recv_post_dequeue,
- vmci_transport_notify_pkt_send_init,
- vmci_transport_notify_pkt_send_pre_block,
- vmci_transport_notify_pkt_send_pre_enqueue,
- vmci_transport_notify_pkt_send_post_enqueue,
- vmci_transport_notify_pkt_process_request,
- vmci_transport_notify_pkt_process_negotiate,
+ .socket_init = vmci_transport_notify_pkt_socket_init,
+ .socket_destruct = vmci_transport_notify_pkt_socket_destruct,
+ .poll_in = vmci_transport_notify_pkt_poll_in,
+ .poll_out = vmci_transport_notify_pkt_poll_out,
+ .handle_notify_pkt = vmci_transport_notify_pkt_handle_pkt,
+ .recv_init = vmci_transport_notify_pkt_recv_init,
+ .recv_pre_block = vmci_transport_notify_pkt_recv_pre_block,
+ .recv_pre_dequeue = vmci_transport_notify_pkt_recv_pre_dequeue,
+ .recv_post_dequeue = vmci_transport_notify_pkt_recv_post_dequeue,
+ .send_init = vmci_transport_notify_pkt_send_init,
+ .send_pre_block = vmci_transport_notify_pkt_send_pre_block,
+ .send_pre_enqueue = vmci_transport_notify_pkt_send_pre_enqueue,
+ .send_post_enqueue = vmci_transport_notify_pkt_send_post_enqueue,
+ .process_request = vmci_transport_notify_pkt_process_request,
+ .process_negotiate = vmci_transport_notify_pkt_process_negotiate,
};
diff --git a/net/vmw_vsock/vmci_transport_notify_qstate.c b/net/vmw_vsock/vmci_transport_notify_qstate.c
index 21e591dafb03..f3a0afc46208 100644
--- a/net/vmw_vsock/vmci_transport_notify_qstate.c
+++ b/net/vmw_vsock/vmci_transport_notify_qstate.c
@@ -420,19 +420,19 @@ vmci_transport_notify_pkt_send_pre_enqueue(
/* Socket always on control packet based operations. */
const struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops = {
- vmci_transport_notify_pkt_socket_init,
- vmci_transport_notify_pkt_socket_destruct,
- vmci_transport_notify_pkt_poll_in,
- vmci_transport_notify_pkt_poll_out,
- vmci_transport_notify_pkt_handle_pkt,
- vmci_transport_notify_pkt_recv_init,
- vmci_transport_notify_pkt_recv_pre_block,
- vmci_transport_notify_pkt_recv_pre_dequeue,
- vmci_transport_notify_pkt_recv_post_dequeue,
- vmci_transport_notify_pkt_send_init,
- vmci_transport_notify_pkt_send_pre_block,
- vmci_transport_notify_pkt_send_pre_enqueue,
- vmci_transport_notify_pkt_send_post_enqueue,
- vmci_transport_notify_pkt_process_request,
- vmci_transport_notify_pkt_process_negotiate,
+ .socket_init = vmci_transport_notify_pkt_socket_init,
+ .socket_destruct = vmci_transport_notify_pkt_socket_destruct,
+ .poll_in = vmci_transport_notify_pkt_poll_in,
+ .poll_out = vmci_transport_notify_pkt_poll_out,
+ .handle_notify_pkt = vmci_transport_notify_pkt_handle_pkt,
+ .recv_init = vmci_transport_notify_pkt_recv_init,
+ .recv_pre_block = vmci_transport_notify_pkt_recv_pre_block,
+ .recv_pre_dequeue = vmci_transport_notify_pkt_recv_pre_dequeue,
+ .recv_post_dequeue = vmci_transport_notify_pkt_recv_post_dequeue,
+ .send_init = vmci_transport_notify_pkt_send_init,
+ .send_pre_block = vmci_transport_notify_pkt_send_pre_block,
+ .send_pre_enqueue = vmci_transport_notify_pkt_send_pre_enqueue,
+ .send_post_enqueue = vmci_transport_notify_pkt_send_post_enqueue,
+ .process_request = vmci_transport_notify_pkt_process_request,
+ .process_negotiate = vmci_transport_notify_pkt_process_negotiate,
};
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
index 3f816e2971ee..5db731512014 100644
--- a/net/wimax/stack.c
+++ b/net/wimax/stack.c
@@ -572,16 +572,20 @@ struct d_level D_LEVEL[] = {
size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL);
-struct genl_family wimax_gnl_family = {
- .id = GENL_ID_GENERATE,
+static const struct genl_multicast_group wimax_gnl_mcgrps[] = {
+ { .name = "msg", },
+};
+
+struct genl_family wimax_gnl_family __ro_after_init = {
.name = "WiMAX",
.version = WIMAX_GNL_VERSION,
.hdrsize = 0,
.maxattr = WIMAX_GNL_ATTR_MAX,
-};
-
-static const struct genl_multicast_group wimax_gnl_mcgrps[] = {
- { .name = "msg", },
+ .module = THIS_MODULE,
+ .ops = wimax_gnl_ops,
+ .n_ops = ARRAY_SIZE(wimax_gnl_ops),
+ .mcgrps = wimax_gnl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(wimax_gnl_mcgrps),
};
@@ -596,11 +600,7 @@ int __init wimax_subsys_init(void)
d_parse_params(D_LEVEL, D_LEVEL_SIZE, wimax_debug_params,
"wimax.debug");
- snprintf(wimax_gnl_family.name, sizeof(wimax_gnl_family.name),
- "WiMAX");
- result = genl_register_family_with_ops_groups(&wimax_gnl_family,
- wimax_gnl_ops,
- wimax_gnl_mcgrps);
+ result = genl_register_family(&wimax_gnl_family);
if (unlikely(result < 0)) {
pr_err("cannot register generic netlink family: %d\n", result);
goto error_register_family;
diff --git a/net/wireless/Makefile b/net/wireless/Makefile
index 4c9e39f04ef8..d06e5015751a 100644
--- a/net/wireless/Makefile
+++ b/net/wireless/Makefile
@@ -11,14 +11,13 @@ obj-$(CONFIG_WEXT_PRIV) += wext-priv.o
cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o
cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o ap.o trace.o ocb.o
+cfg80211-$(CONFIG_OF) += of.o
cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o
cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o
cfg80211-$(CONFIG_CFG80211_INTERNAL_REGDB) += regdb.o
CFLAGS_trace.o := -I$(src)
-ccflags-y += -D__CHECK_ENDIAN__
-
$(obj)/regdb.c: $(src)/db.txt $(src)/genregdb.awk
@$(AWK) -f $(srctree)/$(src)/genregdb.awk < $< > $@
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 8201e6d7449e..e55e05bc4805 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -210,11 +210,11 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
if (WARN_ON(wdev->iftype != NL80211_IFTYPE_P2P_DEVICE))
return;
- if (!wdev->p2p_started)
+ if (!wdev_running(wdev))
return;
rdev_stop_p2p_device(rdev, wdev);
- wdev->p2p_started = false;
+ wdev->is_running = false;
rdev->opencount--;
@@ -233,11 +233,11 @@ void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
if (WARN_ON(wdev->iftype != NL80211_IFTYPE_NAN))
return;
- if (!wdev->nan_started)
+ if (!wdev_running(wdev))
return;
rdev_stop_nan(rdev, wdev);
- wdev->nan_started = false;
+ wdev->is_running = false;
rdev->opencount--;
}
@@ -562,6 +562,21 @@ static int wiphy_verify_combinations(struct wiphy *wiphy)
c->limits[j].max > 1))
return -EINVAL;
+ /*
+ * This isn't well-defined right now. If you have an
+ * IBSS interface, then its beacon interval may change
+ * by joining other networks, and nothing prevents it
+ * from doing that.
+ * So technically we probably shouldn't even allow AP
+ * and IBSS in the same interface, but it seems that
+ * some drivers support that, possibly only with fixed
+ * beacon intervals for IBSS.
+ */
+ if (WARN_ON(types & BIT(NL80211_IFTYPE_ADHOC) &&
+ c->beacon_int_min_gcd)) {
+ return -EINVAL;
+ }
+
cnt += c->limits[j].max;
/*
* Don't advertise an unsupported type
@@ -571,6 +586,11 @@ static int wiphy_verify_combinations(struct wiphy *wiphy)
return -EINVAL;
}
+#ifndef CONFIG_WIRELESS_WDS
+ if (WARN_ON(all_iftypes & BIT(NL80211_IFTYPE_WDS)))
+ return -EINVAL;
+#endif
+
/* You can't even choose that many! */
if (WARN_ON(cnt < c->max_interfaces))
return -EINVAL;
@@ -606,8 +626,14 @@ int wiphy_register(struct wiphy *wiphy)
if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) &&
(!rdev->ops->start_nan || !rdev->ops->stop_nan ||
- !rdev->ops->add_nan_func || !rdev->ops->del_nan_func)))
+ !rdev->ops->add_nan_func || !rdev->ops->del_nan_func ||
+ !(wiphy->nan_supported_bands & BIT(NL80211_BAND_2GHZ)))))
+ return -EINVAL;
+
+#ifndef CONFIG_WIRELESS_WDS
+ if (WARN_ON(wiphy->interface_modes & BIT(NL80211_IFTYPE_WDS)))
return -EINVAL;
+#endif
/*
* if a wiphy has unsupported modes for regulatory channel enforcement,
@@ -1117,6 +1143,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr)
dev->priv_flags |= IFF_DONT_BRIDGE;
+ INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk);
+
nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
break;
case NETDEV_GOING_DOWN:
@@ -1205,6 +1233,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
#ifdef CONFIG_CFG80211_WEXT
kzfree(wdev->wext.keys);
#endif
+ flush_work(&wdev->disconnect_wk);
}
/*
* synchronise (so that we won't find this netdev
diff --git a/net/wireless/core.h b/net/wireless/core.h
index f0c0c8a48c92..58ca206982fe 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -228,6 +228,7 @@ struct cfg80211_event {
size_t resp_ie_len;
struct cfg80211_bss *bss;
int status; /* -1 = failed; 0..65535 = status code */
+ enum nl80211_timeout_reason timeout_reason;
} cr;
struct {
const u8 *req_ie;
@@ -346,7 +347,7 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
const u8 *ssid, int ssid_len,
const u8 *ie, int ie_len,
const u8 *key, int key_len, int key_idx,
- const u8 *sae_data, int sae_data_len);
+ const u8 *auth_data, int auth_data_len);
int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
struct net_device *dev,
struct ieee80211_channel *chan,
@@ -388,7 +389,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
const u8 *req_ie, size_t req_ie_len,
const u8 *resp_ie, size_t resp_ie_len,
int status, bool wextev,
- struct cfg80211_bss *bss);
+ struct cfg80211_bss *bss,
+ enum nl80211_timeout_reason timeout_reason);
void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
size_t ie_len, u16 reason, bool from_ap);
int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
@@ -400,6 +402,7 @@ void __cfg80211_roamed(struct wireless_dev *wdev,
const u8 *resp_ie, size_t resp_ie_len);
int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
struct wireless_dev *wdev);
+void cfg80211_autodisconnect_wk(struct work_struct *work);
/* SME implementation */
void cfg80211_conn_work(struct work_struct *work);
@@ -410,6 +413,7 @@ void cfg80211_sme_disassoc(struct wireless_dev *wdev);
void cfg80211_sme_deauth(struct wireless_dev *wdev);
void cfg80211_sme_auth_timeout(struct wireless_dev *wdev);
void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev);
+void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev);
/* internal helpers */
bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher);
@@ -429,6 +433,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev);
void cfg80211_process_wdev_events(struct wireless_dev *wdev);
+bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
+ u32 center_freq_khz, u32 bw_khz);
+
/**
* cfg80211_chandef_dfs_usable - checks if chandef is DFS usable
* @wiphy: the wiphy to validate against
@@ -476,7 +483,7 @@ int ieee80211_get_ratemask(struct ieee80211_supported_band *sband,
u32 *mask);
int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
- u32 beacon_int);
+ enum nl80211_iftype iftype, u32 beacon_int);
void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev,
enum nl80211_iftype iftype, int num);
diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c
index 5d453916a417..30fc6eb352bc 100644
--- a/net/wireless/debugfs.c
+++ b/net/wireless/debugfs.c
@@ -17,7 +17,7 @@
static ssize_t name## _read(struct file *file, char __user *userbuf, \
size_t count, loff_t *ppos) \
{ \
- struct wiphy *wiphy= file->private_data; \
+ struct wiphy *wiphy = file->private_data; \
char buf[buflen]; \
int res; \
\
@@ -29,14 +29,14 @@ static const struct file_operations name## _ops = { \
.read = name## _read, \
.open = simple_open, \
.llseek = generic_file_llseek, \
-};
+}
DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d",
- wiphy->rts_threshold)
+ wiphy->rts_threshold);
DEBUGFS_READONLY_FILE(fragmentation_threshold, 20, "%d",
wiphy->frag_threshold);
DEBUGFS_READONLY_FILE(short_retry_limit, 20, "%d",
- wiphy->retry_short)
+ wiphy->retry_short);
DEBUGFS_READONLY_FILE(long_retry_limit, 20, "%d",
wiphy->retry_long);
@@ -103,7 +103,7 @@ static const struct file_operations ht40allow_map_ops = {
};
#define DEBUGFS_ADD(name) \
- debugfs_create_file(#name, S_IRUGO, phyd, &rdev->wiphy, &name## _ops);
+ debugfs_create_file(#name, 0444, phyd, &rdev->wiphy, &name## _ops)
void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev)
{
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index 71447cf86306..ba0a1f398ce5 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -556,7 +556,7 @@ static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr)
memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */
memcpy(hdr + ETH_ALEN, hdr11->addr4, ETH_ALEN); /* SA */
break;
- case 0:
+ default:
memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */
memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */
break;
diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c
index fa2066b56f36..2d8518a37eab 100644
--- a/net/wireless/mesh.c
+++ b/net/wireless/mesh.c
@@ -183,6 +183,7 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
memcpy(wdev->ssid, setup->mesh_id, setup->mesh_id_len);
wdev->mesh_id_len = setup->mesh_id_len;
wdev->chandef = setup->chandef;
+ wdev->beacon_interval = setup->beacon_interval;
}
return err;
@@ -258,6 +259,7 @@ int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
err = rdev_leave_mesh(rdev, dev);
if (!err) {
wdev->mesh_id_len = 0;
+ wdev->beacon_interval = 0;
memset(&wdev->chandef, 0, sizeof(wdev->chandef));
rdev_set_qos_map(rdev, dev, NULL);
}
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index cbb48e26a871..22b3d9990065 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -48,7 +48,8 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss,
/* update current_bss etc., consumes the bss reference */
__cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, ie, len - ieoffs,
status_code,
- status_code == WLAN_STATUS_SUCCESS, bss);
+ status_code == WLAN_STATUS_SUCCESS, bss,
+ NL80211_TIMEOUT_UNSPECIFIED);
}
EXPORT_SYMBOL(cfg80211_rx_assoc_resp);
@@ -149,6 +150,18 @@ void cfg80211_assoc_timeout(struct net_device *dev, struct cfg80211_bss *bss)
}
EXPORT_SYMBOL(cfg80211_assoc_timeout);
+void cfg80211_abandon_assoc(struct net_device *dev, struct cfg80211_bss *bss)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct wiphy *wiphy = wdev->wiphy;
+
+ cfg80211_sme_abandon_assoc(wdev);
+
+ cfg80211_unhold_bss(bss_from_pub(bss));
+ cfg80211_put_bss(wiphy, bss);
+}
+EXPORT_SYMBOL(cfg80211_abandon_assoc);
+
void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
@@ -204,14 +217,14 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
const u8 *ssid, int ssid_len,
const u8 *ie, int ie_len,
const u8 *key, int key_len, int key_idx,
- const u8 *sae_data, int sae_data_len)
+ const u8 *auth_data, int auth_data_len)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_auth_request req = {
.ie = ie,
.ie_len = ie_len,
- .sae_data = sae_data,
- .sae_data_len = sae_data_len,
+ .auth_data = auth_data,
+ .auth_data_len = auth_data_len,
.auth_type = auth_type,
.key = key,
.key_len = key_len,
@@ -333,6 +346,11 @@ int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
!ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
return 0;
+ if (ether_addr_equal(wdev->disconnect_bssid, bssid) ||
+ (wdev->current_bss &&
+ ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
+ wdev->conn_owner_nlportid = 0;
+
return rdev_deauth(rdev, dev, &req);
}
@@ -645,8 +663,25 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
return err;
}
- if (!ether_addr_equal(mgmt->sa, wdev_address(wdev)))
- return -EINVAL;
+ if (!ether_addr_equal(mgmt->sa, wdev_address(wdev))) {
+ /* Allow random TA to be used with Public Action frames if the
+ * driver has indicated support for this. Otherwise, only allow
+ * the local address to be used.
+ */
+ if (!ieee80211_is_action(mgmt->frame_control) ||
+ mgmt->u.action.category != WLAN_CATEGORY_PUBLIC)
+ return -EINVAL;
+ if (!wdev->current_bss &&
+ !wiphy_ext_feature_isset(
+ &rdev->wiphy,
+ NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA))
+ return -EINVAL;
+ if (wdev->current_bss &&
+ !wiphy_ext_feature_isset(
+ &rdev->wiphy,
+ NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED))
+ return -EINVAL;
+ }
/* Transmit the Action frame as requested by user space */
return rdev_mgmt_tx(rdev, wdev, params, cookie);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index c510810f0b7c..2312dc2ffdb9 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3,7 +3,7 @@
*
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
- * Copyright 2015-2016 Intel Deutschland GmbH
+ * Copyright 2015-2017 Intel Deutschland GmbH
*/
#include <linux/if.h>
@@ -32,22 +32,8 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
struct cfg80211_crypto_settings *settings,
int cipher_limit);
-static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
- struct genl_info *info);
-static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
- struct genl_info *info);
-
/* the netlink family */
-static struct genl_family nl80211_fam = {
- .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */
- .name = NL80211_GENL_NAME, /* have users key off the name instead */
- .hdrsize = 0, /* no private header */
- .version = 1, /* no particular meaning now */
- .maxattr = NL80211_ATTR_MAX,
- .netnsok = true,
- .pre_doit = nl80211_pre_doit,
- .post_doit = nl80211_post_doit,
-};
+static struct genl_family nl80211_fam;
/* multicast groups */
enum nl80211_multicast_groups {
@@ -357,7 +343,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_BG_SCAN_PERIOD] = { .type = NLA_U16 },
[NL80211_ATTR_WDEV] = { .type = NLA_U64 },
[NL80211_ATTR_USER_REG_HINT_TYPE] = { .type = NLA_U32 },
- [NL80211_ATTR_SAE_DATA] = { .type = NLA_BINARY, },
+ [NL80211_ATTR_AUTH_DATA] = { .type = NLA_BINARY, },
[NL80211_ATTR_VHT_CAPABILITY] = { .len = NL80211_VHT_CAPABILITY_LEN },
[NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 },
[NL80211_ATTR_P2P_CTWINDOW] = { .type = NLA_U8 },
@@ -412,8 +398,18 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
},
[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN },
[NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 },
- [NL80211_ATTR_NAN_DUAL] = { .type = NLA_U8 },
+ [NL80211_ATTR_BANDS] = { .type = NLA_U32 },
[NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED },
+ [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY,
+ .len = FILS_MAX_KEK_LEN },
+ [NL80211_ATTR_FILS_NONCES] = { .len = 2 * FILS_NONCE_LEN },
+ [NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED] = { .type = NLA_FLAG, },
+ [NL80211_ATTR_BSSID] = { .len = ETH_ALEN },
+ [NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] = { .type = NLA_S8 },
+ [NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST] = {
+ .len = sizeof(struct nl80211_bss_select_rssi_adjust)
+ },
+ [NL80211_ATTR_TIMEOUT_REASON] = { .type = NLA_U32 },
};
/* policy for the key attributes */
@@ -435,6 +431,7 @@ nl80211_key_default_policy[NUM_NL80211_KEY_DEFAULT_TYPES] = {
[NL80211_KEY_DEFAULT_TYPE_MULTICAST] = { .type = NLA_FLAG },
};
+#ifdef CONFIG_PM
/* policy for WoWLAN attributes */
static const struct nla_policy
nl80211_wowlan_policy[NUM_NL80211_WOWLAN_TRIG] = {
@@ -468,6 +465,7 @@ nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = {
[NL80211_WOWLAN_TCP_WAKE_PAYLOAD] = { .len = 1 },
[NL80211_WOWLAN_TCP_WAKE_MASK] = { .len = 1 },
};
+#endif /* CONFIG_PM */
/* policy for coalesce rule attributes */
static const struct nla_policy
@@ -547,21 +545,18 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
{
int err;
- rtnl_lock();
-
if (!cb->args[0]) {
err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
- nl80211_fam.attrbuf, nl80211_fam.maxattr,
- nl80211_policy);
+ genl_family_attrbuf(&nl80211_fam),
+ nl80211_fam.maxattr, nl80211_policy);
if (err)
- goto out_unlock;
+ return err;
- *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk),
- nl80211_fam.attrbuf);
- if (IS_ERR(*wdev)) {
- err = PTR_ERR(*wdev);
- goto out_unlock;
- }
+ *wdev = __cfg80211_wdev_from_attrs(
+ sock_net(skb->sk),
+ genl_family_attrbuf(&nl80211_fam));
+ if (IS_ERR(*wdev))
+ return PTR_ERR(*wdev);
*rdev = wiphy_to_rdev((*wdev)->wiphy);
/* 0 is the first index - add 1 to parse only once */
cb->args[0] = (*rdev)->wiphy_idx + 1;
@@ -571,10 +566,8 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1);
struct wireless_dev *tmp;
- if (!wiphy) {
- err = -ENODEV;
- goto out_unlock;
- }
+ if (!wiphy)
+ return -ENODEV;
*rdev = wiphy_to_rdev(wiphy);
*wdev = NULL;
@@ -585,21 +578,11 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
}
}
- if (!*wdev) {
- err = -ENODEV;
- goto out_unlock;
- }
+ if (!*wdev)
+ return -ENODEV;
}
return 0;
- out_unlock:
- rtnl_unlock();
- return err;
-}
-
-static void nl80211_finish_wdev_dump(struct cfg80211_registered_device *rdev)
-{
- rtnl_unlock();
}
/* IE validation */
@@ -1075,6 +1058,10 @@ static int nl80211_put_iface_combinations(struct wiphy *wiphy,
nla_put_u32(msg, NL80211_IFACE_COMB_RADAR_DETECT_REGIONS,
c->radar_detect_regions)))
goto nla_put_failure;
+ if (c->beacon_int_min_gcd &&
+ nla_put_u32(msg, NL80211_IFACE_COMB_BI_MIN_GCD,
+ c->beacon_int_min_gcd))
+ goto nla_put_failure;
nla_nest_end(msg, nl_combi);
}
@@ -1322,6 +1309,95 @@ nl80211_send_mgmt_stypes(struct sk_buff *msg,
return 0;
}
+#define CMD(op, n) \
+ do { \
+ if (rdev->ops->op) { \
+ i++; \
+ if (nla_put_u32(msg, i, NL80211_CMD_ ## n)) \
+ goto nla_put_failure; \
+ } \
+ } while (0)
+
+static int nl80211_add_commands_unsplit(struct cfg80211_registered_device *rdev,
+ struct sk_buff *msg)
+{
+ int i = 0;
+
+ /*
+ * do *NOT* add anything into this function, new things need to be
+ * advertised only to new versions of userspace that can deal with
+ * the split (and they can't possibly care about new features...
+ */
+ CMD(add_virtual_intf, NEW_INTERFACE);
+ CMD(change_virtual_intf, SET_INTERFACE);
+ CMD(add_key, NEW_KEY);
+ CMD(start_ap, START_AP);
+ CMD(add_station, NEW_STATION);
+ CMD(add_mpath, NEW_MPATH);
+ CMD(update_mesh_config, SET_MESH_CONFIG);
+ CMD(change_bss, SET_BSS);
+ CMD(auth, AUTHENTICATE);
+ CMD(assoc, ASSOCIATE);
+ CMD(deauth, DEAUTHENTICATE);
+ CMD(disassoc, DISASSOCIATE);
+ CMD(join_ibss, JOIN_IBSS);
+ CMD(join_mesh, JOIN_MESH);
+ CMD(set_pmksa, SET_PMKSA);
+ CMD(del_pmksa, DEL_PMKSA);
+ CMD(flush_pmksa, FLUSH_PMKSA);
+ if (rdev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL)
+ CMD(remain_on_channel, REMAIN_ON_CHANNEL);
+ CMD(set_bitrate_mask, SET_TX_BITRATE_MASK);
+ CMD(mgmt_tx, FRAME);
+ CMD(mgmt_tx_cancel_wait, FRAME_WAIT_CANCEL);
+ if (rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK) {
+ i++;
+ if (nla_put_u32(msg, i, NL80211_CMD_SET_WIPHY_NETNS))
+ goto nla_put_failure;
+ }
+ if (rdev->ops->set_monitor_channel || rdev->ops->start_ap ||
+ rdev->ops->join_mesh) {
+ i++;
+ if (nla_put_u32(msg, i, NL80211_CMD_SET_CHANNEL))
+ goto nla_put_failure;
+ }
+ CMD(set_wds_peer, SET_WDS_PEER);
+ if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) {
+ CMD(tdls_mgmt, TDLS_MGMT);
+ CMD(tdls_oper, TDLS_OPER);
+ }
+ if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN)
+ CMD(sched_scan_start, START_SCHED_SCAN);
+ CMD(probe_client, PROBE_CLIENT);
+ CMD(set_noack_map, SET_NOACK_MAP);
+ if (rdev->wiphy.flags & WIPHY_FLAG_REPORTS_OBSS) {
+ i++;
+ if (nla_put_u32(msg, i, NL80211_CMD_REGISTER_BEACONS))
+ goto nla_put_failure;
+ }
+ CMD(start_p2p_device, START_P2P_DEVICE);
+ CMD(set_mcast_rate, SET_MCAST_RATE);
+#ifdef CONFIG_NL80211_TESTMODE
+ CMD(testmode_cmd, TESTMODE);
+#endif
+
+ if (rdev->ops->connect || rdev->ops->auth) {
+ i++;
+ if (nla_put_u32(msg, i, NL80211_CMD_CONNECT))
+ goto nla_put_failure;
+ }
+
+ if (rdev->ops->disconnect || rdev->ops->deauth) {
+ i++;
+ if (nla_put_u32(msg, i, NL80211_CMD_DISCONNECT))
+ goto nla_put_failure;
+ }
+
+ return i;
+ nla_put_failure:
+ return -ENOBUFS;
+}
+
struct nl80211_dump_wiphy_state {
s64 filter_wiphy;
long start;
@@ -1549,68 +1625,9 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
if (!nl_cmds)
goto nla_put_failure;
- i = 0;
-#define CMD(op, n) \
- do { \
- if (rdev->ops->op) { \
- i++; \
- if (nla_put_u32(msg, i, NL80211_CMD_ ## n)) \
- goto nla_put_failure; \
- } \
- } while (0)
-
- CMD(add_virtual_intf, NEW_INTERFACE);
- CMD(change_virtual_intf, SET_INTERFACE);
- CMD(add_key, NEW_KEY);
- CMD(start_ap, START_AP);
- CMD(add_station, NEW_STATION);
- CMD(add_mpath, NEW_MPATH);
- CMD(update_mesh_config, SET_MESH_CONFIG);
- CMD(change_bss, SET_BSS);
- CMD(auth, AUTHENTICATE);
- CMD(assoc, ASSOCIATE);
- CMD(deauth, DEAUTHENTICATE);
- CMD(disassoc, DISASSOCIATE);
- CMD(join_ibss, JOIN_IBSS);
- CMD(join_mesh, JOIN_MESH);
- CMD(set_pmksa, SET_PMKSA);
- CMD(del_pmksa, DEL_PMKSA);
- CMD(flush_pmksa, FLUSH_PMKSA);
- if (rdev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL)
- CMD(remain_on_channel, REMAIN_ON_CHANNEL);
- CMD(set_bitrate_mask, SET_TX_BITRATE_MASK);
- CMD(mgmt_tx, FRAME);
- CMD(mgmt_tx_cancel_wait, FRAME_WAIT_CANCEL);
- if (rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK) {
- i++;
- if (nla_put_u32(msg, i, NL80211_CMD_SET_WIPHY_NETNS))
- goto nla_put_failure;
- }
- if (rdev->ops->set_monitor_channel || rdev->ops->start_ap ||
- rdev->ops->join_mesh) {
- i++;
- if (nla_put_u32(msg, i, NL80211_CMD_SET_CHANNEL))
- goto nla_put_failure;
- }
- CMD(set_wds_peer, SET_WDS_PEER);
- if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) {
- CMD(tdls_mgmt, TDLS_MGMT);
- CMD(tdls_oper, TDLS_OPER);
- }
- if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN)
- CMD(sched_scan_start, START_SCHED_SCAN);
- CMD(probe_client, PROBE_CLIENT);
- CMD(set_noack_map, SET_NOACK_MAP);
- if (rdev->wiphy.flags & WIPHY_FLAG_REPORTS_OBSS) {
- i++;
- if (nla_put_u32(msg, i, NL80211_CMD_REGISTER_BEACONS))
- goto nla_put_failure;
- }
- CMD(start_p2p_device, START_P2P_DEVICE);
- CMD(set_mcast_rate, SET_MCAST_RATE);
-#ifdef CONFIG_NL80211_TESTMODE
- CMD(testmode_cmd, TESTMODE);
-#endif
+ i = nl80211_add_commands_unsplit(rdev, msg);
+ if (i < 0)
+ goto nla_put_failure;
if (state->split) {
CMD(crit_proto_start, CRIT_PROTOCOL_START);
CMD(crit_proto_stop, CRIT_PROTOCOL_STOP);
@@ -1620,22 +1637,11 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
if (rdev->wiphy.features &
NL80211_FEATURE_SUPPORTS_WMM_ADMISSION)
CMD(add_tx_ts, ADD_TX_TS);
+ CMD(set_multicast_to_unicast, SET_MULTICAST_TO_UNICAST);
+ CMD(update_connect_params, UPDATE_CONNECT_PARAMS);
}
- /* add into the if now */
#undef CMD
- if (rdev->ops->connect || rdev->ops->auth) {
- i++;
- if (nla_put_u32(msg, i, NL80211_CMD_CONNECT))
- goto nla_put_failure;
- }
-
- if (rdev->ops->disconnect || rdev->ops->deauth) {
- i++;
- if (nla_put_u32(msg, i, NL80211_CMD_DISCONNECT))
- goto nla_put_failure;
- }
-
nla_nest_end(msg, nl_cmds);
state->split_start++;
if (state->split)
@@ -1864,6 +1870,10 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
}
}
+ if (nla_put_u32(msg, NL80211_ATTR_BANDS,
+ rdev->wiphy.nan_supported_bands))
+ goto nla_put_failure;
+
/* done */
state->split_start = 0;
break;
@@ -1881,7 +1891,7 @@ static int nl80211_dump_wiphy_parse(struct sk_buff *skb,
struct netlink_callback *cb,
struct nl80211_dump_wiphy_state *state)
{
- struct nlattr **tb = nl80211_fam.attrbuf;
+ struct nlattr **tb = genl_family_attrbuf(&nl80211_fam);
int ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
tb, nl80211_fam.maxattr, nl80211_policy);
/* ignore parse errors for backward compatibility */
@@ -2296,10 +2306,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
nla_for_each_nested(nl_txq_params,
info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS],
rem_txq_params) {
- result = nla_parse(tb, NL80211_TXQ_ATTR_MAX,
- nla_data(nl_txq_params),
- nla_len(nl_txq_params),
- txq_params_policy);
+ result = nla_parse_nested(tb, NL80211_TXQ_ATTR_MAX,
+ nl_txq_params,
+ txq_params_policy);
if (result)
return result;
result = parse_txq_params(tb, &txq_params);
@@ -2583,17 +2592,17 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
int filter_wiphy = -1;
struct cfg80211_registered_device *rdev;
struct wireless_dev *wdev;
+ int ret;
rtnl_lock();
if (!cb->args[2]) {
struct nl80211_dump_wiphy_state state = {
.filter_wiphy = -1,
};
- int ret;
ret = nl80211_dump_wiphy_parse(skb, cb, &state);
if (ret)
- return ret;
+ goto out_unlock;
filter_wiphy = state.filter_wiphy;
@@ -2638,12 +2647,14 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
wp_idx++;
}
out:
- rtnl_unlock();
-
cb->args[0] = wp_idx;
cb->args[1] = if_idx;
- return skb->len;
+ ret = skb->len;
+ out_unlock:
+ rtnl_unlock();
+
+ return ret;
}
static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info)
@@ -3549,8 +3560,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
sband = rdev->wiphy.bands[band];
if (sband == NULL)
return -EINVAL;
- err = nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates),
- nla_len(tx_rates), nl80211_txattr_policy);
+ err = nla_parse_nested(tb, NL80211_TXRATE_MAX, tx_rates,
+ nl80211_txattr_policy);
if (err)
return err;
if (tb[NL80211_TXRATE_LEGACY]) {
@@ -3722,6 +3733,49 @@ static int nl80211_parse_beacon(struct nlattr *attrs[],
return 0;
}
+static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params,
+ const u8 *rates)
+{
+ int i;
+
+ if (!rates)
+ return;
+
+ for (i = 0; i < rates[1]; i++) {
+ if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HT_PHY)
+ params->ht_required = true;
+ if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_VHT_PHY)
+ params->vht_required = true;
+ }
+}
+
+/*
+ * Since the nl80211 API didn't include, from the beginning, attributes about
+ * HT/VHT requirements/capabilities, we parse them out of the IEs for the
+ * benefit of drivers that rebuild IEs in the firmware.
+ */
+static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params)
+{
+ const struct cfg80211_beacon_data *bcn = &params->beacon;
+ size_t ies_len = bcn->beacon_ies_len;
+ const u8 *ies = bcn->beacon_ies;
+ const u8 *rates;
+ const u8 *cap;
+
+ rates = cfg80211_find_ie(WLAN_EID_SUPP_RATES, ies, ies_len);
+ nl80211_check_ap_rate_selectors(params, rates);
+
+ rates = cfg80211_find_ie(WLAN_EID_EXT_SUPP_RATES, ies, ies_len);
+ nl80211_check_ap_rate_selectors(params, rates);
+
+ cap = cfg80211_find_ie(WLAN_EID_HT_CAPABILITY, ies, ies_len);
+ if (cap && cap[1] >= sizeof(*params->ht_cap))
+ params->ht_cap = (void *)(cap + 2);
+ cap = cfg80211_find_ie(WLAN_EID_VHT_CAPABILITY, ies, ies_len);
+ if (cap && cap[1] >= sizeof(*params->vht_cap))
+ params->vht_cap = (void *)(cap + 2);
+}
+
static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev,
struct cfg80211_ap_settings *params)
{
@@ -3756,12 +3810,23 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) &&
auth_type == NL80211_AUTHTYPE_SAE)
return false;
+ if (!wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_FILS_STA) &&
+ (auth_type == NL80211_AUTHTYPE_FILS_SK ||
+ auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
+ auth_type == NL80211_AUTHTYPE_FILS_PK))
+ return false;
return true;
case NL80211_CMD_CONNECT:
case NL80211_CMD_START_AP:
/* SAE not supported yet */
if (auth_type == NL80211_AUTHTYPE_SAE)
return false;
+ /* FILS not supported yet */
+ if (auth_type == NL80211_AUTHTYPE_FILS_SK ||
+ auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
+ auth_type == NL80211_AUTHTYPE_FILS_PK)
+ return false;
return true;
default:
return false;
@@ -3803,7 +3868,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
params.dtim_period =
nla_get_u32(info->attrs[NL80211_ATTR_DTIM_PERIOD]);
- err = cfg80211_validate_beacon_int(rdev, params.beacon_interval);
+ err = cfg80211_validate_beacon_int(rdev, dev->ieee80211_ptr->iftype,
+ params.beacon_interval);
if (err)
return err;
@@ -3938,6 +4004,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
return PTR_ERR(params.acl);
}
+ nl80211_calculate_ap_params(&params);
+
wdev_lock(wdev);
err = rdev_start_ap(rdev, dev, &params);
if (!err) {
@@ -4370,9 +4438,10 @@ static int nl80211_dump_station(struct sk_buff *skb,
int sta_idx = cb->args[2];
int err;
+ rtnl_lock();
err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
if (err)
- return err;
+ goto out_err;
if (!wdev->netdev) {
err = -EINVAL;
@@ -4407,7 +4476,7 @@ static int nl80211_dump_station(struct sk_buff *skb,
cb->args[2] = sta_idx;
err = skb->len;
out_err:
- nl80211_finish_wdev_dump(rdev);
+ rtnl_unlock();
return err;
}
@@ -4587,6 +4656,15 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
break;
}
+ /*
+ * Older kernel versions ignored this attribute entirely, so don't
+ * reject attempts to update it but mark it as unused instead so the
+ * driver won't look at the data.
+ */
+ if (statype != CFG80211_STA_AP_CLIENT_UNASSOC &&
+ statype != CFG80211_STA_TDLS_PEER_SETUP)
+ params->opmode_notif_used = false;
+
return 0;
}
EXPORT_SYMBOL(cfg80211_check_station_change);
@@ -4826,6 +4904,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
params.local_pm = pm;
}
+ if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
+ params.opmode_notif_used = true;
+ params.opmode_notif =
+ nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]);
+ }
+
/* Include parameters for TDLS peer (will check later) */
err = nl80211_set_station_tdls(info, &params);
if (err)
@@ -5178,9 +5262,10 @@ static int nl80211_dump_mpath(struct sk_buff *skb,
int path_idx = cb->args[2];
int err;
+ rtnl_lock();
err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
if (err)
- return err;
+ goto out_err;
if (!rdev->ops->dump_mpath) {
err = -EOPNOTSUPP;
@@ -5213,7 +5298,7 @@ static int nl80211_dump_mpath(struct sk_buff *skb,
cb->args[2] = path_idx;
err = skb->len;
out_err:
- nl80211_finish_wdev_dump(rdev);
+ rtnl_unlock();
return err;
}
@@ -5373,9 +5458,10 @@ static int nl80211_dump_mpp(struct sk_buff *skb,
int path_idx = cb->args[2];
int err;
+ rtnl_lock();
err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
if (err)
- return err;
+ goto out_err;
if (!rdev->ops->dump_mpp) {
err = -EOPNOTSUPP;
@@ -5408,7 +5494,7 @@ static int nl80211_dump_mpp(struct sk_buff *skb,
cb->args[2] = path_idx;
err = skb->len;
out_err:
- nl80211_finish_wdev_dump(rdev);
+ rtnl_unlock();
return err;
}
@@ -5873,6 +5959,7 @@ do { \
break;
}
cfg->ht_opmode = ht_opmode;
+ mask |= (1 << (NL80211_MESHCONF_HT_OPMODE - 1));
}
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathToRootTimeout,
1, 65535, mask,
@@ -6305,9 +6392,8 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info)
nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES],
rem_reg_rules) {
- r = nla_parse(tb, NL80211_REG_RULE_ATTR_MAX,
- nla_data(nl_reg_rule), nla_len(nl_reg_rule),
- reg_rule_policy);
+ r = nla_parse_nested(tb, NL80211_REG_RULE_ATTR_MAX,
+ nl_reg_rule, reg_rule_policy);
if (r)
goto bad_reg;
r = parse_reg_rule(tb, &rd->reg_rules[rule_idx]);
@@ -6374,8 +6460,8 @@ static int parse_bss_select(struct nlattr *nla, struct wiphy *wiphy,
if (!nla_ok(nest, nla_len(nest)))
return -EINVAL;
- err = nla_parse(attr, NL80211_BSS_SELECT_ATTR_MAX, nla_data(nest),
- nla_len(nest), nl80211_bss_select_policy);
+ err = nla_parse_nested(attr, NL80211_BSS_SELECT_ATTR_MAX, nest,
+ nl80211_bss_select_policy);
if (err)
return err;
@@ -6677,7 +6763,20 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
request->no_cck =
nla_get_flag(info->attrs[NL80211_ATTR_TX_NO_CCK_RATE]);
- if (info->attrs[NL80211_ATTR_MAC])
+ /* Initial implementation used NL80211_ATTR_MAC to set the specific
+ * BSSID to scan for. This was problematic because that same attribute
+ * was already used for another purpose (local random MAC address). The
+ * NL80211_ATTR_BSSID attribute was added to fix this. For backwards
+ * compatibility with older userspace components, also use the
+ * NL80211_ATTR_MAC value here if it can be determined to be used for
+ * the specific BSSID use case instead of the random MAC address
+ * (NL80211_ATTR_SCAN_FLAGS is used to enable random MAC address use).
+ */
+ if (info->attrs[NL80211_ATTR_BSSID])
+ memcpy(request->bssid,
+ nla_data(info->attrs[NL80211_ATTR_BSSID]), ETH_ALEN);
+ else if (!(request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) &&
+ info->attrs[NL80211_ATTR_MAC])
memcpy(request->bssid, nla_data(info->attrs[NL80211_ATTR_MAC]),
ETH_ALEN);
else
@@ -6735,13 +6834,10 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans,
/*
* If scan plans are not specified,
- * %NL80211_ATTR_SCHED_SCAN_INTERVAL must be specified. In this
+ * %NL80211_ATTR_SCHED_SCAN_INTERVAL will be specified. In this
* case one scan plan will be set with the specified scan
* interval and infinite number of iterations.
*/
- if (!attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL])
- return -EINVAL;
-
interval = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]);
if (!interval)
return -EINVAL;
@@ -6765,9 +6861,8 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans,
if (WARN_ON(i >= n_plans))
return -EINVAL;
- err = nla_parse(plan, NL80211_SCHED_SCAN_PLAN_MAX,
- nla_data(attr), nla_len(attr),
- nl80211_plan_policy);
+ err = nla_parse_nested(plan, NL80211_SCHED_SCAN_PLAN_MAX,
+ attr, nl80211_plan_policy);
if (err)
return err;
@@ -6811,7 +6906,7 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans,
static struct cfg80211_sched_scan_request *
nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
- struct nlattr **attrs)
+ struct nlattr **attrs, int max_match_sets)
{
struct cfg80211_sched_scan_request *request;
struct nlattr *attr;
@@ -6856,9 +6951,9 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
tmp) {
struct nlattr *rssi;
- err = nla_parse(tb, NL80211_SCHED_SCAN_MATCH_ATTR_MAX,
- nla_data(attr), nla_len(attr),
- nl80211_match_policy);
+ err = nla_parse_nested(tb,
+ NL80211_SCHED_SCAN_MATCH_ATTR_MAX,
+ attr, nl80211_match_policy);
if (err)
return ERR_PTR(err);
/* add other standalone attributes here */
@@ -6876,7 +6971,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
if (!n_match_sets && default_match_rssi != NL80211_SCAN_RSSI_THOLD_OFF)
n_match_sets = 1;
- if (n_match_sets > wiphy->max_match_sets)
+ if (n_match_sets > max_match_sets)
return ERR_PTR(-EINVAL);
if (attrs[NL80211_ATTR_IE])
@@ -6914,6 +7009,12 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
if (!n_plans || n_plans > wiphy->max_sched_scan_plans)
return ERR_PTR(-EINVAL);
+ if (!wiphy_ext_feature_isset(
+ wiphy, NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI) &&
+ (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] ||
+ attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]))
+ return ERR_PTR(-EINVAL);
+
request = kzalloc(sizeof(*request)
+ sizeof(*request->ssids) * n_ssids
+ sizeof(*request->match_sets) * n_match_sets
@@ -7029,9 +7130,9 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
tmp) {
struct nlattr *ssid, *rssi;
- err = nla_parse(tb, NL80211_SCHED_SCAN_MATCH_ATTR_MAX,
- nla_data(attr), nla_len(attr),
- nl80211_match_policy);
+ err = nla_parse_nested(tb,
+ NL80211_SCHED_SCAN_MATCH_ATTR_MAX,
+ attr, nl80211_match_policy);
if (err)
goto out_free;
ssid = tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID];
@@ -7120,6 +7221,26 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
request->delay =
nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_DELAY]);
+ if (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]) {
+ request->relative_rssi = nla_get_s8(
+ attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]);
+ request->relative_rssi_set = true;
+ }
+
+ if (request->relative_rssi_set &&
+ attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]) {
+ struct nl80211_bss_select_rssi_adjust *rssi_adjust;
+
+ rssi_adjust = nla_data(
+ attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]);
+ request->rssi_adjust.band = rssi_adjust->band;
+ request->rssi_adjust.delta = rssi_adjust->delta;
+ if (!is_band_valid(wiphy, request->rssi_adjust.band)) {
+ err = -EINVAL;
+ goto out_free;
+ }
+ }
+
err = nl80211_parse_sched_scan_plans(wiphy, n_plans, request, attrs);
if (err)
goto out_free;
@@ -7150,7 +7271,8 @@ static int nl80211_start_sched_scan(struct sk_buff *skb,
return -EINPROGRESS;
sched_scan_req = nl80211_parse_sched_scan(&rdev->wiphy, wdev,
- info->attrs);
+ info->attrs,
+ rdev->wiphy.max_match_sets);
err = PTR_ERR_OR_ZERO(sched_scan_req);
if (err)
@@ -7541,9 +7663,12 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb)
int start = cb->args[2], idx = 0;
int err;
+ rtnl_lock();
err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
- if (err)
+ if (err) {
+ rtnl_unlock();
return err;
+ }
wdev_lock(wdev);
spin_lock_bh(&rdev->bss_lock);
@@ -7566,7 +7691,7 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb)
wdev_unlock(wdev);
cb->args[2] = idx;
- nl80211_finish_wdev_dump(rdev);
+ rtnl_unlock();
return skb->len;
}
@@ -7643,6 +7768,7 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq,
static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam);
struct survey_info survey;
struct cfg80211_registered_device *rdev;
struct wireless_dev *wdev;
@@ -7650,12 +7776,13 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
int res;
bool radio_stats;
+ rtnl_lock();
res = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
if (res)
- return res;
+ goto out_err;
/* prepare_wdev_dump parsed the attributes */
- radio_stats = nl80211_fam.attrbuf[NL80211_ATTR_SURVEY_RADIO_STATS];
+ radio_stats = attrbuf[NL80211_ATTR_SURVEY_RADIO_STATS];
if (!wdev->netdev) {
res = -EINVAL;
@@ -7693,7 +7820,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
cb->args[2] = survey_idx;
res = skb->len;
out_err:
- nl80211_finish_wdev_dump(rdev);
+ rtnl_unlock();
return res;
}
@@ -7708,8 +7835,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct net_device *dev = info->user_ptr[1];
struct ieee80211_channel *chan;
- const u8 *bssid, *ssid, *ie = NULL, *sae_data = NULL;
- int err, ssid_len, ie_len = 0, sae_data_len = 0;
+ const u8 *bssid, *ssid, *ie = NULL, *auth_data = NULL;
+ int err, ssid_len, ie_len = 0, auth_data_len = 0;
enum nl80211_auth_type auth_type;
struct key_parse key;
bool local_state_change;
@@ -7789,17 +7916,23 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
if (!nl80211_valid_auth_type(rdev, auth_type, NL80211_CMD_AUTHENTICATE))
return -EINVAL;
- if (auth_type == NL80211_AUTHTYPE_SAE &&
- !info->attrs[NL80211_ATTR_SAE_DATA])
+ if ((auth_type == NL80211_AUTHTYPE_SAE ||
+ auth_type == NL80211_AUTHTYPE_FILS_SK ||
+ auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
+ auth_type == NL80211_AUTHTYPE_FILS_PK) &&
+ !info->attrs[NL80211_ATTR_AUTH_DATA])
return -EINVAL;
- if (info->attrs[NL80211_ATTR_SAE_DATA]) {
- if (auth_type != NL80211_AUTHTYPE_SAE)
+ if (info->attrs[NL80211_ATTR_AUTH_DATA]) {
+ if (auth_type != NL80211_AUTHTYPE_SAE &&
+ auth_type != NL80211_AUTHTYPE_FILS_SK &&
+ auth_type != NL80211_AUTHTYPE_FILS_SK_PFS &&
+ auth_type != NL80211_AUTHTYPE_FILS_PK)
return -EINVAL;
- sae_data = nla_data(info->attrs[NL80211_ATTR_SAE_DATA]);
- sae_data_len = nla_len(info->attrs[NL80211_ATTR_SAE_DATA]);
+ auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]);
+ auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]);
/* need to include at least Auth Transaction and Status Code */
- if (sae_data_len < 4)
+ if (auth_data_len < 4)
return -EINVAL;
}
@@ -7816,7 +7949,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
err = cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid,
ssid, ssid_len, ie, ie_len,
key.p.key, key.p.key_len, key.idx,
- sae_data, sae_data_len);
+ auth_data, auth_data_len);
wdev_unlock(dev->ieee80211_ptr);
return err;
}
@@ -7995,11 +8128,29 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
req.flags |= ASSOC_REQ_USE_RRM;
}
+ if (info->attrs[NL80211_ATTR_FILS_KEK]) {
+ req.fils_kek = nla_data(info->attrs[NL80211_ATTR_FILS_KEK]);
+ req.fils_kek_len = nla_len(info->attrs[NL80211_ATTR_FILS_KEK]);
+ if (!info->attrs[NL80211_ATTR_FILS_NONCES])
+ return -EINVAL;
+ req.fils_nonces =
+ nla_data(info->attrs[NL80211_ATTR_FILS_NONCES]);
+ }
+
err = nl80211_crypto_settings(rdev, info, &req.crypto, 1);
if (!err) {
wdev_lock(dev->ieee80211_ptr);
+
err = cfg80211_mlme_assoc(rdev, dev, chan, bssid,
ssid, ssid_len, &req);
+
+ if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
+ dev->ieee80211_ptr->conn_owner_nlportid =
+ info->snd_portid;
+ memcpy(dev->ieee80211_ptr->disconnect_bssid,
+ bssid, ETH_ALEN);
+ }
+
wdev_unlock(dev->ieee80211_ptr);
}
@@ -8152,7 +8303,8 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
ibss.beacon_interval =
nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]);
- err = cfg80211_validate_beacon_int(rdev, ibss.beacon_interval);
+ err = cfg80211_validate_beacon_int(rdev, NL80211_IFTYPE_ADHOC,
+ ibss.beacon_interval);
if (err)
return err;
@@ -8477,25 +8629,29 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
* so we need to offset by 1.
*/
phy_idx = cb->args[0] - 1;
+
+ rdev = cfg80211_rdev_by_wiphy_idx(phy_idx);
+ if (!rdev) {
+ err = -ENOENT;
+ goto out_err;
+ }
} else {
+ struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam);
+
err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
- nl80211_fam.attrbuf, nl80211_fam.maxattr,
- nl80211_policy);
+ attrbuf, nl80211_fam.maxattr, nl80211_policy);
if (err)
goto out_err;
- rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk),
- nl80211_fam.attrbuf);
+ rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk), attrbuf);
if (IS_ERR(rdev)) {
err = PTR_ERR(rdev);
goto out_err;
}
phy_idx = rdev->wiphy_idx;
- rdev = NULL;
- if (nl80211_fam.attrbuf[NL80211_ATTR_TESTDATA])
- cb->args[1] =
- (long)nl80211_fam.attrbuf[NL80211_ATTR_TESTDATA];
+ if (attrbuf[NL80211_ATTR_TESTDATA])
+ cb->args[1] = (long)attrbuf[NL80211_ATTR_TESTDATA];
}
if (cb->args[1]) {
@@ -8503,12 +8659,6 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
data_len = nla_len((void *)cb->args[1]);
}
- rdev = cfg80211_rdev_by_wiphy_idx(phy_idx);
- if (!rdev) {
- err = -ENOENT;
- goto out_err;
- }
-
if (!rdev->ops->testmode_dump) {
err = -EOPNOTSUPP;
goto out_err;
@@ -8718,14 +8868,58 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
}
wdev_lock(dev->ieee80211_ptr);
+
err = cfg80211_connect(rdev, dev, &connect, connkeys,
connect.prev_bssid);
- wdev_unlock(dev->ieee80211_ptr);
if (err)
kzfree(connkeys);
+
+ if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
+ dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid;
+ if (connect.bssid)
+ memcpy(dev->ieee80211_ptr->disconnect_bssid,
+ connect.bssid, ETH_ALEN);
+ else
+ memset(dev->ieee80211_ptr->disconnect_bssid,
+ 0, ETH_ALEN);
+ }
+
+ wdev_unlock(dev->ieee80211_ptr);
+
return err;
}
+static int nl80211_update_connect_params(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_connect_params connect = {};
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ u32 changed = 0;
+ int ret;
+
+ if (!rdev->ops->update_connect_params)
+ return -EOPNOTSUPP;
+
+ if (info->attrs[NL80211_ATTR_IE]) {
+ if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
+ return -EINVAL;
+ connect.ie = nla_data(info->attrs[NL80211_ATTR_IE]);
+ connect.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+ changed |= UPDATE_ASSOC_IES;
+ }
+
+ wdev_lock(dev->ieee80211_ptr);
+ if (!wdev->current_bss)
+ ret = -ENOLINK;
+ else
+ ret = rdev_update_connect_params(rdev, dev, &connect, changed);
+ wdev_unlock(dev->ieee80211_ptr);
+
+ return ret;
+}
+
static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
{
struct cfg80211_registered_device *rdev = info->user_ptr[0];
@@ -9278,6 +9472,7 @@ nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] = {
[NL80211_ATTR_CQM_TXE_RATE] = { .type = NLA_U32 },
[NL80211_ATTR_CQM_TXE_PKTS] = { .type = NLA_U32 },
[NL80211_ATTR_CQM_TXE_INTVL] = { .type = NLA_U32 },
+ [NL80211_ATTR_CQM_RSSI_LEVEL] = { .type = NLA_S32 },
};
static int nl80211_set_cqm_txe(struct genl_info *info,
@@ -9417,7 +9612,9 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
setup.beacon_interval =
nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]);
- err = cfg80211_validate_beacon_int(rdev, setup.beacon_interval);
+ err = cfg80211_validate_beacon_int(rdev,
+ NL80211_IFTYPE_MESH_POINT,
+ setup.beacon_interval);
if (err)
return err;
}
@@ -9585,6 +9782,20 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg,
if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_DELAY, req->delay))
return -ENOBUFS;
+ if (req->relative_rssi_set) {
+ struct nl80211_bss_select_rssi_adjust rssi_adjust;
+
+ if (nla_put_s8(msg, NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI,
+ req->relative_rssi))
+ return -ENOBUFS;
+
+ rssi_adjust.band = req->rssi_adjust.band;
+ rssi_adjust.delta = req->rssi_adjust.delta;
+ if (nla_put(msg, NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST,
+ sizeof(rssi_adjust), &rssi_adjust))
+ return -ENOBUFS;
+ }
+
freqs = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQUENCIES);
if (!freqs)
return -ENOBUFS;
@@ -9728,9 +9939,8 @@ static int nl80211_parse_wowlan_tcp(struct cfg80211_registered_device *rdev,
if (!rdev->wiphy.wowlan->tcp)
return -EINVAL;
- err = nla_parse(tb, MAX_NL80211_WOWLAN_TCP,
- nla_data(attr), nla_len(attr),
- nl80211_wowlan_tcp_policy);
+ err = nla_parse_nested(tb, MAX_NL80211_WOWLAN_TCP, attr,
+ nl80211_wowlan_tcp_policy);
if (err)
return err;
@@ -9875,13 +10085,12 @@ static int nl80211_parse_wowlan_nd(struct cfg80211_registered_device *rdev,
goto out;
}
- err = nla_parse(tb, NL80211_ATTR_MAX,
- nla_data(attr), nla_len(attr),
- nl80211_policy);
+ err = nla_parse_nested(tb, NL80211_ATTR_MAX, attr, nl80211_policy);
if (err)
goto out;
- trig->nd_config = nl80211_parse_sched_scan(&rdev->wiphy, NULL, tb);
+ trig->nd_config = nl80211_parse_sched_scan(&rdev->wiphy, NULL, tb,
+ wowlan->max_nd_match_sets);
err = PTR_ERR_OR_ZERO(trig->nd_config);
if (err)
trig->nd_config = NULL;
@@ -9911,10 +10120,9 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
goto set_wakeup;
}
- err = nla_parse(tb, MAX_NL80211_WOWLAN_TRIG,
- nla_data(info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS]),
- nla_len(info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS]),
- nl80211_wowlan_policy);
+ err = nla_parse_nested(tb, MAX_NL80211_WOWLAN_TRIG,
+ info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS],
+ nl80211_wowlan_policy);
if (err)
return err;
@@ -9996,8 +10204,8 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
rem) {
u8 *mask_pat;
- nla_parse(pat_tb, MAX_NL80211_PKTPAT, nla_data(pat),
- nla_len(pat), NULL);
+ nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat,
+ NULL);
err = -EINVAL;
if (!pat_tb[NL80211_PKTPAT_MASK] ||
!pat_tb[NL80211_PKTPAT_PATTERN])
@@ -10207,8 +10415,8 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev,
int rem, pat_len, mask_len, pkt_offset, n_patterns = 0;
struct nlattr *pat_tb[NUM_NL80211_PKTPAT];
- err = nla_parse(tb, NL80211_ATTR_COALESCE_RULE_MAX, nla_data(rule),
- nla_len(rule), nl80211_coalesce_policy);
+ err = nla_parse_nested(tb, NL80211_ATTR_COALESCE_RULE_MAX, rule,
+ nl80211_coalesce_policy);
if (err)
return err;
@@ -10246,8 +10454,7 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev,
rem) {
u8 *mask_pat;
- nla_parse(pat_tb, MAX_NL80211_PKTPAT, nla_data(pat),
- nla_len(pat), NULL);
+ nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, NULL);
if (!pat_tb[NL80211_PKTPAT_MASK] ||
!pat_tb[NL80211_PKTPAT_PATTERN])
return -EINVAL;
@@ -10366,10 +10573,9 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info)
if (!info->attrs[NL80211_ATTR_REKEY_DATA])
return -EINVAL;
- err = nla_parse(tb, MAX_NL80211_REKEY_DATA,
- nla_data(info->attrs[NL80211_ATTR_REKEY_DATA]),
- nla_len(info->attrs[NL80211_ATTR_REKEY_DATA]),
- nl80211_rekey_policy);
+ err = nla_parse_nested(tb, MAX_NL80211_REKEY_DATA,
+ info->attrs[NL80211_ATTR_REKEY_DATA],
+ nl80211_rekey_policy);
if (err)
return err;
@@ -10518,7 +10724,7 @@ static int nl80211_start_p2p_device(struct sk_buff *skb, struct genl_info *info)
if (wdev->iftype != NL80211_IFTYPE_P2P_DEVICE)
return -EOPNOTSUPP;
- if (wdev->p2p_started)
+ if (wdev_running(wdev))
return 0;
if (rfkill_blocked(rdev->rfkill))
@@ -10528,7 +10734,7 @@ static int nl80211_start_p2p_device(struct sk_buff *skb, struct genl_info *info)
if (err)
return err;
- wdev->p2p_started = true;
+ wdev->is_running = true;
rdev->opencount++;
return 0;
@@ -10560,7 +10766,7 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info)
if (wdev->iftype != NL80211_IFTYPE_NAN)
return -EOPNOTSUPP;
- if (wdev->nan_started)
+ if (wdev_running(wdev))
return -EEXIST;
if (rfkill_blocked(rdev->rfkill))
@@ -10569,21 +10775,28 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info)
if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF])
return -EINVAL;
- if (!info->attrs[NL80211_ATTR_NAN_DUAL])
- return -EINVAL;
-
conf.master_pref =
nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]);
if (!conf.master_pref)
return -EINVAL;
- conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]);
+ if (info->attrs[NL80211_ATTR_BANDS]) {
+ u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);
+
+ if (bands & ~(u32)wdev->wiphy->nan_supported_bands)
+ return -EOPNOTSUPP;
+
+ if (bands && !(bands & BIT(NL80211_BAND_2GHZ)))
+ return -EINVAL;
+
+ conf.bands = bands;
+ }
err = rdev_start_nan(rdev, wdev, &conf);
if (err)
return err;
- wdev->nan_started = true;
+ wdev->is_running = true;
rdev->opencount++;
return 0;
@@ -10638,8 +10851,7 @@ static int handle_nan_filter(struct nlattr *attr_filter,
i = 0;
nla_for_each_nested(attr, attr_filter, rem) {
- filter[i].filter = kmemdup(nla_data(attr), nla_len(attr),
- GFP_KERNEL);
+ filter[i].filter = nla_memdup(attr, GFP_KERNEL);
filter[i].len = nla_len(attr);
i++;
}
@@ -10668,7 +10880,7 @@ static int nl80211_nan_add_func(struct sk_buff *skb,
if (wdev->iftype != NL80211_IFTYPE_NAN)
return -EOPNOTSUPP;
- if (!wdev->nan_started)
+ if (!wdev_running(wdev))
return -ENOTCONN;
if (!info->attrs[NL80211_ATTR_NAN_FUNC])
@@ -10678,10 +10890,9 @@ static int nl80211_nan_add_func(struct sk_buff *skb,
wdev->owner_nlportid != info->snd_portid)
return -ENOTCONN;
- err = nla_parse(tb, NL80211_NAN_FUNC_ATTR_MAX,
- nla_data(info->attrs[NL80211_ATTR_NAN_FUNC]),
- nla_len(info->attrs[NL80211_ATTR_NAN_FUNC]),
- nl80211_nan_func_policy);
+ err = nla_parse_nested(tb, NL80211_NAN_FUNC_ATTR_MAX,
+ info->attrs[NL80211_ATTR_NAN_FUNC],
+ nl80211_nan_func_policy);
if (err)
return err;
@@ -10776,9 +10987,9 @@ static int nl80211_nan_add_func(struct sk_buff *skb,
if (tb[NL80211_NAN_FUNC_SRF]) {
struct nlattr *srf_tb[NUM_NL80211_NAN_SRF_ATTR];
- err = nla_parse(srf_tb, NL80211_NAN_SRF_ATTR_MAX,
- nla_data(tb[NL80211_NAN_FUNC_SRF]),
- nla_len(tb[NL80211_NAN_FUNC_SRF]), NULL);
+ err = nla_parse_nested(srf_tb, NL80211_NAN_SRF_ATTR_MAX,
+ tb[NL80211_NAN_FUNC_SRF],
+ nl80211_nan_srf_policy);
if (err)
goto out;
@@ -10904,7 +11115,7 @@ static int nl80211_nan_del_func(struct sk_buff *skb,
if (wdev->iftype != NL80211_IFTYPE_NAN)
return -EOPNOTSUPP;
- if (!wdev->nan_started)
+ if (!wdev_running(wdev))
return -ENOTCONN;
if (!info->attrs[NL80211_ATTR_COOKIE])
@@ -10932,7 +11143,7 @@ static int nl80211_nan_change_config(struct sk_buff *skb,
if (wdev->iftype != NL80211_IFTYPE_NAN)
return -EOPNOTSUPP;
- if (!wdev->nan_started)
+ if (!wdev_running(wdev))
return -ENOTCONN;
if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) {
@@ -10944,9 +11155,17 @@ static int nl80211_nan_change_config(struct sk_buff *skb,
changed |= CFG80211_NAN_CONF_CHANGED_PREF;
}
- if (info->attrs[NL80211_ATTR_NAN_DUAL]) {
- conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]);
- changed |= CFG80211_NAN_CONF_CHANGED_DUAL;
+ if (info->attrs[NL80211_ATTR_BANDS]) {
+ u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);
+
+ if (bands & ~(u32)wdev->wiphy->nan_supported_bands)
+ return -EOPNOTSUPP;
+
+ if (bands && !(bands & BIT(NL80211_BAND_2GHZ)))
+ return -EINVAL;
+
+ conf.bands = bands;
+ changed |= CFG80211_NAN_CONF_CHANGED_BANDS;
}
if (!changed)
@@ -11244,10 +11463,7 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) {
- if (wdev->netdev &&
- !netif_running(wdev->netdev))
- return -ENETDOWN;
- if (!wdev->netdev && !wdev->p2p_started)
+ if (!wdev_running(wdev))
return -ENETDOWN;
}
@@ -11277,6 +11493,7 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
struct cfg80211_registered_device **rdev,
struct wireless_dev **wdev)
{
+ struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam);
u32 vid, subcmd;
unsigned int i;
int vcmd_idx = -1;
@@ -11284,17 +11501,13 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
void *data = NULL;
unsigned int data_len = 0;
- rtnl_lock();
-
if (cb->args[0]) {
/* subtract the 1 again here */
struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1);
struct wireless_dev *tmp;
- if (!wiphy) {
- err = -ENODEV;
- goto out_unlock;
- }
+ if (!wiphy)
+ return -ENODEV;
*rdev = wiphy_to_rdev(wiphy);
*wdev = NULL;
@@ -11312,31 +11525,24 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
}
err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
- nl80211_fam.attrbuf, nl80211_fam.maxattr,
- nl80211_policy);
+ attrbuf, nl80211_fam.maxattr, nl80211_policy);
if (err)
- goto out_unlock;
+ return err;
- if (!nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_ID] ||
- !nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_SUBCMD]) {
- err = -EINVAL;
- goto out_unlock;
- }
+ if (!attrbuf[NL80211_ATTR_VENDOR_ID] ||
+ !attrbuf[NL80211_ATTR_VENDOR_SUBCMD])
+ return -EINVAL;
- *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk),
- nl80211_fam.attrbuf);
+ *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk), attrbuf);
if (IS_ERR(*wdev))
*wdev = NULL;
- *rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk),
- nl80211_fam.attrbuf);
- if (IS_ERR(*rdev)) {
- err = PTR_ERR(*rdev);
- goto out_unlock;
- }
+ *rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk), attrbuf);
+ if (IS_ERR(*rdev))
+ return PTR_ERR(*rdev);
- vid = nla_get_u32(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_ID]);
- subcmd = nla_get_u32(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_SUBCMD]);
+ vid = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_ID]);
+ subcmd = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_SUBCMD]);
for (i = 0; i < (*rdev)->wiphy.n_vendor_commands; i++) {
const struct wiphy_vendor_command *vcmd;
@@ -11346,23 +11552,19 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
if (vcmd->info.vendor_id != vid || vcmd->info.subcmd != subcmd)
continue;
- if (!vcmd->dumpit) {
- err = -EOPNOTSUPP;
- goto out_unlock;
- }
+ if (!vcmd->dumpit)
+ return -EOPNOTSUPP;
vcmd_idx = i;
break;
}
- if (vcmd_idx < 0) {
- err = -EOPNOTSUPP;
- goto out_unlock;
- }
+ if (vcmd_idx < 0)
+ return -EOPNOTSUPP;
- if (nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]) {
- data = nla_data(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]);
- data_len = nla_len(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]);
+ if (attrbuf[NL80211_ATTR_VENDOR_DATA]) {
+ data = nla_data(attrbuf[NL80211_ATTR_VENDOR_DATA]);
+ data_len = nla_len(attrbuf[NL80211_ATTR_VENDOR_DATA]);
}
/* 0 is the first index - add 1 to parse only once */
@@ -11375,9 +11577,6 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
/* keep rtnl locked in successful case */
return 0;
- out_unlock:
- rtnl_unlock();
- return err;
}
static int nl80211_vendor_cmd_dump(struct sk_buff *skb,
@@ -11392,9 +11591,10 @@ static int nl80211_vendor_cmd_dump(struct sk_buff *skb,
int err;
struct nlattr *vendor_data;
+ rtnl_lock();
err = nl80211_prepare_vendor_dump(skb, cb, &rdev, &wdev);
if (err)
- return err;
+ goto out;
vcmd_idx = cb->args[2];
data = (void *)cb->args[3];
@@ -11403,18 +11603,21 @@ static int nl80211_vendor_cmd_dump(struct sk_buff *skb,
if (vcmd->flags & (WIPHY_VENDOR_CMD_NEED_WDEV |
WIPHY_VENDOR_CMD_NEED_NETDEV)) {
- if (!wdev)
- return -EINVAL;
+ if (!wdev) {
+ err = -EINVAL;
+ goto out;
+ }
if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_NETDEV &&
- !wdev->netdev)
- return -EINVAL;
+ !wdev->netdev) {
+ err = -EINVAL;
+ goto out;
+ }
if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) {
- if (wdev->netdev &&
- !netif_running(wdev->netdev))
- return -ENETDOWN;
- if (!wdev->netdev && !wdev->p2p_started)
- return -ENETDOWN;
+ if (!wdev_running(wdev)) {
+ err = -ENETDOWN;
+ goto out;
+ }
}
}
@@ -11726,6 +11929,28 @@ static int nl80211_tdls_cancel_channel_switch(struct sk_buff *skb,
return 0;
}
+static int nl80211_set_multicast_to_unicast(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ const struct nlattr *nla;
+ bool enabled;
+
+ if (!rdev->ops->set_multicast_to_unicast)
+ return -EOPNOTSUPP;
+
+ if (wdev->iftype != NL80211_IFTYPE_AP &&
+ wdev->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EOPNOTSUPP;
+
+ nla = info->attrs[NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED];
+ enabled = nla_get_flag(nla);
+
+ return rdev_set_multicast_to_unicast(rdev, dev, enabled);
+}
+
#define NL80211_FLAG_NEED_WIPHY 0x01
#define NL80211_FLAG_NEED_NETDEV 0x02
#define NL80211_FLAG_NEED_RTNL 0x04
@@ -11784,29 +12009,15 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
info->user_ptr[1] = wdev;
}
- if (dev) {
- if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP &&
- !netif_running(dev)) {
- if (rtnl)
- rtnl_unlock();
- return -ENETDOWN;
- }
+ if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP &&
+ !wdev_running(wdev)) {
+ if (rtnl)
+ rtnl_unlock();
+ return -ENETDOWN;
+ }
+ if (dev)
dev_hold(dev);
- } else if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP) {
- if (wdev->iftype == NL80211_IFTYPE_P2P_DEVICE &&
- !wdev->p2p_started) {
- if (rtnl)
- rtnl_unlock();
- return -ENETDOWN;
- }
- if (wdev->iftype == NL80211_IFTYPE_NAN &&
- !wdev->nan_started) {
- if (rtnl)
- rtnl_unlock();
- return -ENETDOWN;
- }
- }
info->user_ptr[0] = rdev;
}
@@ -12179,6 +12390,14 @@ static const struct genl_ops nl80211_ops[] = {
NL80211_FLAG_NEED_RTNL,
},
{
+ .cmd = NL80211_CMD_UPDATE_CONNECT_PARAMS,
+ .doit = nl80211_update_connect_params,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
.cmd = NL80211_CMD_DISCONNECT,
.doit = nl80211_disconnect,
.policy = nl80211_policy,
@@ -12599,6 +12818,29 @@ static const struct genl_ops nl80211_ops[] = {
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
+ {
+ .cmd = NL80211_CMD_SET_MULTICAST_TO_UNICAST,
+ .doit = nl80211_set_multicast_to_unicast,
+ .policy = nl80211_policy,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+};
+
+static struct genl_family nl80211_fam __ro_after_init = {
+ .name = NL80211_GENL_NAME, /* have users key off the name instead */
+ .hdrsize = 0, /* no private header */
+ .version = 1, /* no particular meaning now */
+ .maxattr = NL80211_ATTR_MAX,
+ .netnsok = true,
+ .pre_doit = nl80211_pre_doit,
+ .post_doit = nl80211_post_doit,
+ .module = THIS_MODULE,
+ .ops = nl80211_ops,
+ .n_ops = ARRAY_SIZE(nl80211_ops),
+ .mcgrps = nl80211_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(nl80211_mcgrps),
};
/* notification functions */
@@ -12696,7 +12938,7 @@ static int nl80211_add_scan_req(struct sk_buff *msg,
return -ENOBUFS;
}
-static int nl80211_send_scan_msg(struct sk_buff *msg,
+static int nl80211_prep_scan_msg(struct sk_buff *msg,
struct cfg80211_registered_device *rdev,
struct wireless_dev *wdev,
u32 portid, u32 seq, int flags,
@@ -12727,7 +12969,7 @@ static int nl80211_send_scan_msg(struct sk_buff *msg,
}
static int
-nl80211_send_sched_scan_msg(struct sk_buff *msg,
+nl80211_prep_sched_scan_msg(struct sk_buff *msg,
struct cfg80211_registered_device *rdev,
struct net_device *netdev,
u32 portid, u32 seq, int flags, u32 cmd)
@@ -12759,7 +13001,7 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
if (!msg)
return;
- if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0,
+ if (nl80211_prep_scan_msg(msg, rdev, wdev, 0, 0, 0,
NL80211_CMD_TRIGGER_SCAN) < 0) {
nlmsg_free(msg);
return;
@@ -12778,7 +13020,7 @@ struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
if (!msg)
return NULL;
- if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0,
+ if (nl80211_prep_scan_msg(msg, rdev, wdev, 0, 0, 0,
aborted ? NL80211_CMD_SCAN_ABORTED :
NL80211_CMD_NEW_SCAN_RESULTS) < 0) {
nlmsg_free(msg);
@@ -12788,31 +13030,13 @@ struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
return msg;
}
-void nl80211_send_scan_result(struct cfg80211_registered_device *rdev,
- struct sk_buff *msg)
-{
- if (!msg)
- return;
-
- genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
- NL80211_MCGRP_SCAN, GFP_KERNEL);
-}
-
-void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev,
- struct net_device *netdev)
+/* send message created by nl80211_build_scan_msg() */
+void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev,
+ struct sk_buff *msg)
{
- struct sk_buff *msg;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return;
- if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0,
- NL80211_CMD_SCHED_SCAN_RESULTS) < 0) {
- nlmsg_free(msg);
- return;
- }
-
genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
NL80211_MCGRP_SCAN, GFP_KERNEL);
}
@@ -12826,7 +13050,7 @@ void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
if (!msg)
return;
- if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, cmd) < 0) {
+ if (nl80211_prep_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, cmd) < 0) {
nlmsg_free(msg);
return;
}
@@ -12928,7 +13152,7 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev,
struct sk_buff *msg;
void *hdr;
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ msg = nlmsg_new(100 + len, gfp);
if (!msg)
return;
@@ -13075,12 +13299,14 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
struct net_device *netdev, const u8 *bssid,
const u8 *req_ie, size_t req_ie_len,
const u8 *resp_ie, size_t resp_ie_len,
- int status, gfp_t gfp)
+ int status,
+ enum nl80211_timeout_reason timeout_reason,
+ gfp_t gfp)
{
struct sk_buff *msg;
void *hdr;
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ msg = nlmsg_new(100 + req_ie_len + resp_ie_len, gfp);
if (!msg)
return;
@@ -13096,7 +13322,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
nla_put_u16(msg, NL80211_ATTR_STATUS_CODE,
status < 0 ? WLAN_STATUS_UNSPECIFIED_FAILURE :
status) ||
- (status < 0 && nla_put_flag(msg, NL80211_ATTR_TIMED_OUT)) ||
+ (status < 0 &&
+ (nla_put_flag(msg, NL80211_ATTR_TIMED_OUT) ||
+ nla_put_u32(msg, NL80211_ATTR_TIMEOUT_REASON, timeout_reason))) ||
(req_ie &&
nla_put(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie)) ||
(resp_ie &&
@@ -13122,7 +13350,7 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
struct sk_buff *msg;
void *hdr;
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ msg = nlmsg_new(100 + req_ie_len + resp_ie_len, gfp);
if (!msg)
return;
@@ -13159,7 +13387,7 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
struct sk_buff *msg;
void *hdr;
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ msg = nlmsg_new(100 + ie_len, GFP_KERNEL);
if (!msg)
return;
@@ -13235,7 +13463,7 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr,
trace_cfg80211_notify_new_peer_candidate(dev, addr);
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ msg = nlmsg_new(100 + ie_len, gfp);
if (!msg)
return;
@@ -13606,7 +13834,7 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
struct sk_buff *msg;
void *hdr;
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ msg = nlmsg_new(100 + len, gfp);
if (!msg)
return -ENOMEM;
@@ -13650,7 +13878,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
trace_cfg80211_mgmt_tx_status(wdev, cookie, ack);
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ msg = nlmsg_new(100 + len, gfp);
if (!msg)
return;
@@ -13737,11 +13965,11 @@ static void cfg80211_send_cqm(struct sk_buff *msg, gfp_t gfp)
void cfg80211_cqm_rssi_notify(struct net_device *dev,
enum nl80211_cqm_rssi_threshold_event rssi_event,
- gfp_t gfp)
+ s32 rssi_level, gfp_t gfp)
{
struct sk_buff *msg;
- trace_cfg80211_cqm_rssi_notify(dev, rssi_event);
+ trace_cfg80211_cqm_rssi_notify(dev, rssi_event, rssi_level);
if (WARN_ON(rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW &&
rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH))
@@ -13755,6 +13983,10 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev,
rssi_event))
goto nla_put_failure;
+ if (rssi_level && nla_put_s32(msg, NL80211_ATTR_CQM_RSSI_LEVEL,
+ rssi_level))
+ goto nla_put_failure;
+
cfg80211_send_cqm(msg, gfp);
return;
@@ -14388,19 +14620,25 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) {
bool schedule_destroy_work = false;
- bool schedule_scan_stop = false;
struct cfg80211_sched_scan_request *sched_scan_req =
rcu_dereference(rdev->sched_scan_req);
if (sched_scan_req && notify->portid &&
- sched_scan_req->owner_nlportid == notify->portid)
- schedule_scan_stop = true;
+ sched_scan_req->owner_nlportid == notify->portid) {
+ sched_scan_req->owner_nlportid = 0;
+
+ if (rdev->ops->sched_scan_stop &&
+ rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN)
+ schedule_work(&rdev->sched_scan_stop_wk);
+ }
list_for_each_entry_rcu(wdev, &rdev->wiphy.wdev_list, list) {
cfg80211_mlme_unregister_socket(wdev, notify->portid);
if (wdev->owner_nlportid == notify->portid)
schedule_destroy_work = true;
+ else if (wdev->conn_owner_nlportid == notify->portid)
+ schedule_work(&wdev->disconnect_wk);
}
spin_lock_bh(&rdev->beacon_registrations_lock);
@@ -14425,12 +14663,6 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
spin_unlock(&rdev->destroy_list_lock);
schedule_work(&rdev->destroy_work);
}
- } else if (schedule_scan_stop) {
- sched_scan_req->owner_nlportid = 0;
-
- if (rdev->ops->sched_scan_stop &&
- rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN)
- schedule_work(&rdev->sched_scan_stop_wk);
}
}
@@ -14461,7 +14693,7 @@ void cfg80211_ft_event(struct net_device *netdev,
if (!ft_event->target_ap)
return;
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ msg = nlmsg_new(100 + ft_event->ric_ies_len, GFP_KERNEL);
if (!msg)
return;
@@ -14563,12 +14795,11 @@ void nl80211_send_ap_stopped(struct wireless_dev *wdev)
/* initialisation/exit functions */
-int nl80211_init(void)
+int __init nl80211_init(void)
{
int err;
- err = genl_register_family_with_ops_groups(&nl80211_fam, nl80211_ops,
- nl80211_mcgrps);
+ err = genl_register_family(&nl80211_fam);
if (err)
return err;
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index 7e3821d7fcc5..e488dca87423 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -14,12 +14,10 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
struct wireless_dev *wdev);
struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
struct wireless_dev *wdev, bool aborted);
-void nl80211_send_scan_result(struct cfg80211_registered_device *rdev,
- struct sk_buff *msg);
+void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev,
+ struct sk_buff *msg);
void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
struct net_device *netdev, u32 cmd);
-void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev,
- struct net_device *netdev);
void nl80211_common_reg_change_event(enum nl80211_commands cmd_id,
struct regulatory_request *request);
@@ -58,7 +56,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
struct net_device *netdev, const u8 *bssid,
const u8 *req_ie, size_t req_ie_len,
const u8 *resp_ie, size_t resp_ie_len,
- int status, gfp_t gfp);
+ int status,
+ enum nl80211_timeout_reason timeout_reason,
+ gfp_t gfp);
void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
struct net_device *netdev, const u8 *bssid,
const u8 *req_ie, size_t req_ie_len,
diff --git a/net/wireless/of.c b/net/wireless/of.c
new file mode 100644
index 000000000000..de221f0edca5
--- /dev/null
+++ b/net/wireless/of.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2017 Rafał Miłecki <rafal@milecki.pl>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <linux/of.h>
+#include <net/cfg80211.h>
+#include "core.h"
+
+static bool wiphy_freq_limits_valid_chan(struct wiphy *wiphy,
+ struct ieee80211_freq_range *freq_limits,
+ unsigned int n_freq_limits,
+ struct ieee80211_channel *chan)
+{
+ u32 bw = MHZ_TO_KHZ(20);
+ int i;
+
+ for (i = 0; i < n_freq_limits; i++) {
+ struct ieee80211_freq_range *limit = &freq_limits[i];
+
+ if (cfg80211_does_bw_fit_range(limit,
+ MHZ_TO_KHZ(chan->center_freq),
+ bw))
+ return true;
+ }
+
+ return false;
+}
+
+static void wiphy_freq_limits_apply(struct wiphy *wiphy,
+ struct ieee80211_freq_range *freq_limits,
+ unsigned int n_freq_limits)
+{
+ enum nl80211_band band;
+ int i;
+
+ if (WARN_ON(!n_freq_limits))
+ return;
+
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
+ struct ieee80211_supported_band *sband = wiphy->bands[band];
+
+ if (!sband)
+ continue;
+
+ for (i = 0; i < sband->n_channels; i++) {
+ struct ieee80211_channel *chan = &sband->channels[i];
+
+ if (chan->flags & IEEE80211_CHAN_DISABLED)
+ continue;
+
+ if (!wiphy_freq_limits_valid_chan(wiphy, freq_limits,
+ n_freq_limits,
+ chan)) {
+ pr_debug("Disabling freq %d MHz as it's out of OF limits\n",
+ chan->center_freq);
+ chan->flags |= IEEE80211_CHAN_DISABLED;
+ }
+ }
+ }
+}
+
+void wiphy_read_of_freq_limits(struct wiphy *wiphy)
+{
+ struct device *dev = wiphy_dev(wiphy);
+ struct device_node *np;
+ struct property *prop;
+ struct ieee80211_freq_range *freq_limits;
+ unsigned int n_freq_limits;
+ const __be32 *p;
+ int len, i;
+ int err = 0;
+
+ if (!dev)
+ return;
+ np = dev_of_node(dev);
+ if (!np)
+ return;
+
+ prop = of_find_property(np, "ieee80211-freq-limit", &len);
+ if (!prop)
+ return;
+
+ if (!len || len % sizeof(u32) || len / sizeof(u32) % 2) {
+ dev_err(dev, "ieee80211-freq-limit wrong format");
+ return;
+ }
+ n_freq_limits = len / sizeof(u32) / 2;
+
+ freq_limits = kcalloc(n_freq_limits, sizeof(*freq_limits), GFP_KERNEL);
+ if (!freq_limits) {
+ err = -ENOMEM;
+ goto out_kfree;
+ }
+
+ p = NULL;
+ for (i = 0; i < n_freq_limits; i++) {
+ struct ieee80211_freq_range *limit = &freq_limits[i];
+
+ p = of_prop_next_u32(prop, p, &limit->start_freq_khz);
+ if (!p) {
+ err = -EINVAL;
+ goto out_kfree;
+ }
+
+ p = of_prop_next_u32(prop, p, &limit->end_freq_khz);
+ if (!p) {
+ err = -EINVAL;
+ goto out_kfree;
+ }
+
+ if (!limit->start_freq_khz ||
+ !limit->end_freq_khz ||
+ limit->start_freq_khz >= limit->end_freq_khz) {
+ err = -EINVAL;
+ goto out_kfree;
+ }
+ }
+
+ wiphy_freq_limits_apply(wiphy, freq_limits, n_freq_limits);
+
+out_kfree:
+ kfree(freq_limits);
+ if (err)
+ dev_err(dev, "Failed to get limits: %d\n", err);
+}
+EXPORT_SYMBOL(wiphy_read_of_freq_limits);
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 11cf83c8ad4f..2f425075ada8 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -490,6 +490,18 @@ static inline int rdev_connect(struct cfg80211_registered_device *rdev,
return ret;
}
+static inline int
+rdev_update_connect_params(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_connect_params *sme, u32 changed)
+{
+ int ret;
+ trace_rdev_update_connect_params(&rdev->wiphy, dev, sme, changed);
+ ret = rdev->ops->update_connect_params(&rdev->wiphy, dev, sme, changed);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
static inline int rdev_disconnect(struct cfg80211_registered_device *rdev,
struct net_device *dev, u16 reason_code)
{
@@ -562,6 +574,18 @@ static inline int rdev_set_wds_peer(struct cfg80211_registered_device *rdev,
return ret;
}
+static inline int
+rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ const bool enabled)
+{
+ int ret;
+ trace_rdev_set_multicast_to_unicast(&rdev->wiphy, dev, enabled);
+ ret = rdev->ops->set_multicast_to_unicast(&rdev->wiphy, dev, enabled);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev)
{
trace_rdev_rfkill_poll(&rdev->wiphy);
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 5dbac3749738..753efcd51fa3 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -748,21 +748,6 @@ static bool is_valid_rd(const struct ieee80211_regdomain *rd)
return true;
}
-static bool reg_does_bw_fit(const struct ieee80211_freq_range *freq_range,
- u32 center_freq_khz, u32 bw_khz)
-{
- u32 start_freq_khz, end_freq_khz;
-
- start_freq_khz = center_freq_khz - (bw_khz/2);
- end_freq_khz = center_freq_khz + (bw_khz/2);
-
- if (start_freq_khz >= freq_range->start_freq_khz &&
- end_freq_khz <= freq_range->end_freq_khz)
- return true;
-
- return false;
-}
-
/**
* freq_in_rule_band - tells us if a frequency is in a frequency band
* @freq_range: frequency rule we want to query
@@ -1070,7 +1055,7 @@ freq_reg_info_regd(u32 center_freq,
if (!band_rule_found)
band_rule_found = freq_in_rule_band(fr, center_freq);
- bw_fits = reg_does_bw_fit(fr, center_freq, bw);
+ bw_fits = cfg80211_does_bw_fit_range(fr, center_freq, bw);
if (band_rule_found && bw_fits)
return rr;
@@ -1138,11 +1123,13 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd
max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule);
/* If we get a reg_rule we can assume that at least 5Mhz fit */
- if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq),
- MHZ_TO_KHZ(10)))
+ if (!cfg80211_does_bw_fit_range(freq_range,
+ MHZ_TO_KHZ(chan->center_freq),
+ MHZ_TO_KHZ(10)))
bw_flags |= IEEE80211_CHAN_NO_10MHZ;
- if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq),
- MHZ_TO_KHZ(20)))
+ if (!cfg80211_does_bw_fit_range(freq_range,
+ MHZ_TO_KHZ(chan->center_freq),
+ MHZ_TO_KHZ(20)))
bw_flags |= IEEE80211_CHAN_NO_20MHZ;
if (max_bandwidth_khz < MHZ_TO_KHZ(10))
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 35ad69fd0838..21be56b3128e 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -227,7 +227,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
ASSERT_RTNL();
if (rdev->scan_msg) {
- nl80211_send_scan_result(rdev, rdev->scan_msg);
+ nl80211_send_scan_msg(rdev, rdev->scan_msg);
rdev->scan_msg = NULL;
return;
}
@@ -273,7 +273,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
if (!send_message)
rdev->scan_msg = msg;
else
- nl80211_send_scan_result(rdev, msg);
+ nl80211_send_scan_msg(rdev, msg);
}
void __cfg80211_scan_done(struct work_struct *wk)
@@ -321,7 +321,8 @@ void __cfg80211_sched_scan_results(struct work_struct *wk)
spin_unlock_bh(&rdev->bss_lock);
request->scan_start = jiffies;
}
- nl80211_send_sched_scan_results(rdev, request->dev);
+ nl80211_send_sched_scan(rdev, request->dev,
+ NL80211_CMD_SCHED_SCAN_RESULTS);
}
rtnl_unlock();
@@ -1147,7 +1148,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
else
rcu_assign_pointer(tmp.pub.beacon_ies, ies);
rcu_assign_pointer(tmp.pub.ies, ies);
-
+
memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN);
tmp.pub.channel = channel;
tmp.pub.scan_width = data->scan_width;
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index a77db333927e..b347e63d7aaa 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -34,11 +34,13 @@ struct cfg80211_conn {
CFG80211_CONN_SCAN_AGAIN,
CFG80211_CONN_AUTHENTICATE_NEXT,
CFG80211_CONN_AUTHENTICATING,
- CFG80211_CONN_AUTH_FAILED,
+ CFG80211_CONN_AUTH_FAILED_TIMEOUT,
CFG80211_CONN_ASSOCIATE_NEXT,
CFG80211_CONN_ASSOCIATING,
CFG80211_CONN_ASSOC_FAILED,
+ CFG80211_CONN_ASSOC_FAILED_TIMEOUT,
CFG80211_CONN_DEAUTH,
+ CFG80211_CONN_ABANDON,
CFG80211_CONN_CONNECTED,
} state;
u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN];
@@ -139,7 +141,8 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev)
return err;
}
-static int cfg80211_conn_do_work(struct wireless_dev *wdev)
+static int cfg80211_conn_do_work(struct wireless_dev *wdev,
+ enum nl80211_timeout_reason *treason)
{
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
struct cfg80211_connect_params *params;
@@ -170,7 +173,8 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev)
NULL, 0,
params->key, params->key_len,
params->key_idx, NULL, 0);
- case CFG80211_CONN_AUTH_FAILED:
+ case CFG80211_CONN_AUTH_FAILED_TIMEOUT:
+ *treason = NL80211_TIMEOUT_AUTH;
return -ENOTCONN;
case CFG80211_CONN_ASSOCIATE_NEXT:
if (WARN_ON(!rdev->ops->assoc))
@@ -197,6 +201,9 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev)
WLAN_REASON_DEAUTH_LEAVING,
false);
return err;
+ case CFG80211_CONN_ASSOC_FAILED_TIMEOUT:
+ *treason = NL80211_TIMEOUT_ASSOC;
+ /* fall through */
case CFG80211_CONN_ASSOC_FAILED:
cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid,
NULL, 0,
@@ -206,6 +213,8 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev)
cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid,
NULL, 0,
WLAN_REASON_DEAUTH_LEAVING, false);
+ /* fall through */
+ case CFG80211_CONN_ABANDON:
/* free directly, disconnected event already sent */
cfg80211_sme_free(wdev);
return 0;
@@ -220,6 +229,7 @@ void cfg80211_conn_work(struct work_struct *work)
container_of(work, struct cfg80211_registered_device, conn_work);
struct wireless_dev *wdev;
u8 bssid_buf[ETH_ALEN], *bssid = NULL;
+ enum nl80211_timeout_reason treason;
rtnl_lock();
@@ -241,10 +251,12 @@ void cfg80211_conn_work(struct work_struct *work)
memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN);
bssid = bssid_buf;
}
- if (cfg80211_conn_do_work(wdev)) {
+ treason = NL80211_TIMEOUT_UNSPECIFIED;
+ if (cfg80211_conn_do_work(wdev, &treason)) {
__cfg80211_connect_result(
wdev->netdev, bssid,
- NULL, 0, NULL, 0, -1, false, NULL);
+ NULL, 0, NULL, 0, -1, false, NULL,
+ treason);
}
wdev_unlock(wdev);
}
@@ -349,7 +361,8 @@ void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len)
} else if (status_code != WLAN_STATUS_SUCCESS) {
__cfg80211_connect_result(wdev->netdev, mgmt->bssid,
NULL, 0, NULL, 0,
- status_code, false, NULL);
+ status_code, false, NULL,
+ NL80211_TIMEOUT_UNSPECIFIED);
} else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) {
wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT;
schedule_work(&rdev->conn_work);
@@ -397,7 +410,7 @@ void cfg80211_sme_auth_timeout(struct wireless_dev *wdev)
if (!wdev->conn)
return;
- wdev->conn->state = CFG80211_CONN_AUTH_FAILED;
+ wdev->conn->state = CFG80211_CONN_AUTH_FAILED_TIMEOUT;
schedule_work(&rdev->conn_work);
}
@@ -419,7 +432,18 @@ void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev)
if (!wdev->conn)
return;
- wdev->conn->state = CFG80211_CONN_ASSOC_FAILED;
+ wdev->conn->state = CFG80211_CONN_ASSOC_FAILED_TIMEOUT;
+ schedule_work(&rdev->conn_work);
+}
+
+void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+
+ if (!wdev->conn)
+ return;
+
+ wdev->conn->state = CFG80211_CONN_ABANDON;
schedule_work(&rdev->conn_work);
}
@@ -550,7 +574,9 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev,
/* we're good if we have a matching bss struct */
if (bss) {
- err = cfg80211_conn_do_work(wdev);
+ enum nl80211_timeout_reason treason;
+
+ err = cfg80211_conn_do_work(wdev, &treason);
cfg80211_put_bss(wdev->wiphy, bss);
} else {
/* otherwise we'll need to scan for the AP first */
@@ -647,7 +673,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
const u8 *req_ie, size_t req_ie_len,
const u8 *resp_ie, size_t resp_ie_len,
int status, bool wextev,
- struct cfg80211_bss *bss)
+ struct cfg80211_bss *bss,
+ enum nl80211_timeout_reason timeout_reason)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
const u8 *country_ie;
@@ -666,7 +693,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev,
bssid, req_ie, req_ie_len,
resp_ie, resp_ie_len,
- status, GFP_KERNEL);
+ status, timeout_reason, GFP_KERNEL);
#ifdef CONFIG_CFG80211_WEXT
if (wextev) {
@@ -713,6 +740,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
kzfree(wdev->connect_keys);
wdev->connect_keys = NULL;
wdev->ssid_len = 0;
+ wdev->conn_owner_nlportid = 0;
if (bss) {
cfg80211_unhold_bss(bss_from_pub(bss));
cfg80211_put_bss(wdev->wiphy, bss);
@@ -756,7 +784,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
struct cfg80211_bss *bss, const u8 *req_ie,
size_t req_ie_len, const u8 *resp_ie,
- size_t resp_ie_len, int status, gfp_t gfp)
+ size_t resp_ie_len, int status, gfp_t gfp,
+ enum nl80211_timeout_reason timeout_reason)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
@@ -796,6 +825,7 @@ void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
cfg80211_hold_bss(bss_from_pub(bss));
ev->cr.bss = bss;
ev->cr.status = status;
+ ev->cr.timeout_reason = timeout_reason;
spin_lock_irqsave(&wdev->event_lock, flags);
list_add_tail(&ev->list, &wdev->event_list);
@@ -941,6 +971,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
wdev->current_bss = NULL;
wdev->ssid_len = 0;
+ wdev->conn_owner_nlportid = 0;
nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap);
@@ -1084,12 +1115,43 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
kzfree(wdev->connect_keys);
wdev->connect_keys = NULL;
+ wdev->conn_owner_nlportid = 0;
+
if (wdev->conn)
err = cfg80211_sme_disconnect(wdev, reason);
else if (!rdev->ops->disconnect)
cfg80211_mlme_down(rdev, dev);
- else if (wdev->current_bss)
+ else if (wdev->ssid_len)
err = rdev_disconnect(rdev, dev, reason);
return err;
}
+
+/*
+ * Used to clean up after the connection / connection attempt owner socket
+ * disconnects
+ */
+void cfg80211_autodisconnect_wk(struct work_struct *work)
+{
+ struct wireless_dev *wdev =
+ container_of(work, struct wireless_dev, disconnect_wk);
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+
+ wdev_lock(wdev);
+
+ if (wdev->conn_owner_nlportid) {
+ /*
+ * Use disconnect_bssid if still connecting and ops->disconnect
+ * not implemented. Otherwise we can use cfg80211_disconnect.
+ */
+ if (rdev->ops->disconnect || wdev->current_bss)
+ cfg80211_disconnect(rdev, wdev->netdev,
+ WLAN_REASON_DEAUTH_LEAVING, true);
+ else
+ cfg80211_mlme_deauth(rdev, wdev->netdev,
+ wdev->disconnect_bssid, NULL, 0,
+ WLAN_REASON_DEAUTH_LEAVING, false);
+ }
+
+ wdev_unlock(wdev);
+}
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index 14b3f007826d..570a2b67ca10 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -39,9 +39,11 @@ SHOW_FMT(address_mask, "%pM", wiphy.addr_mask);
static ssize_t name_show(struct device *dev,
struct device_attribute *attr,
- char *buf) {
+ char *buf)
+{
struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy;
- return sprintf(buf, "%s\n", dev_name(&wiphy->dev));
+
+ return sprintf(buf, "%s\n", wiphy_name(wiphy));
}
static DEVICE_ATTR_RO(name);
@@ -130,12 +132,10 @@ static int wiphy_resume(struct device *dev)
/* Age scan results with time spent in suspend */
cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at);
- if (rdev->ops->resume) {
- rtnl_lock();
- if (rdev->wiphy.registered)
- ret = rdev_resume(rdev);
- rtnl_unlock();
- }
+ rtnl_lock();
+ if (rdev->wiphy.registered && rdev->ops->resume)
+ ret = rdev_resume(rdev);
+ rtnl_unlock();
return ret;
}
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index a3d0a91b1e09..776e80cef9b4 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1281,6 +1281,24 @@ TRACE_EVENT(rdev_connect,
__entry->wpa_versions, __entry->flags, MAC_PR_ARG(prev_bssid))
);
+TRACE_EVENT(rdev_update_connect_params,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_connect_params *sme, u32 changed),
+ TP_ARGS(wiphy, netdev, sme, changed),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(u32, changed)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->changed = changed;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", parameters changed: %u",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->changed)
+);
+
TRACE_EVENT(rdev_set_cqm_rssi_config,
TP_PROTO(struct wiphy *wiphy,
struct net_device *netdev, s32 rssi_thold,
@@ -1897,18 +1915,18 @@ TRACE_EVENT(rdev_start_nan,
WIPHY_ENTRY
WDEV_ENTRY
__field(u8, master_pref)
- __field(u8, dual);
+ __field(u8, bands);
),
TP_fast_assign(
WIPHY_ASSIGN;
WDEV_ASSIGN;
__entry->master_pref = conf->master_pref;
- __entry->dual = conf->dual;
+ __entry->bands = conf->bands;
),
TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
- ", master preference: %u, dual: %d",
+ ", master preference: %u, bands: 0x%0x",
WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
- __entry->dual)
+ __entry->bands)
);
TRACE_EVENT(rdev_nan_change_conf,
@@ -1919,20 +1937,20 @@ TRACE_EVENT(rdev_nan_change_conf,
WIPHY_ENTRY
WDEV_ENTRY
__field(u8, master_pref)
- __field(u8, dual);
+ __field(u8, bands);
__field(u32, changes);
),
TP_fast_assign(
WIPHY_ASSIGN;
WDEV_ASSIGN;
__entry->master_pref = conf->master_pref;
- __entry->dual = conf->dual;
+ __entry->bands = conf->bands;
__entry->changes = changes;
),
TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
- ", master preference: %u, dual: %d, changes: %x",
+ ", master preference: %u, bands: 0x%0x, changes: %x",
WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
- __entry->dual, __entry->changes)
+ __entry->bands, __entry->changes)
);
DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan,
@@ -2472,18 +2490,21 @@ TRACE_EVENT(cfg80211_mgmt_tx_status,
TRACE_EVENT(cfg80211_cqm_rssi_notify,
TP_PROTO(struct net_device *netdev,
- enum nl80211_cqm_rssi_threshold_event rssi_event),
- TP_ARGS(netdev, rssi_event),
+ enum nl80211_cqm_rssi_threshold_event rssi_event,
+ s32 rssi_level),
+ TP_ARGS(netdev, rssi_event, rssi_level),
TP_STRUCT__entry(
NETDEV_ENTRY
__field(enum nl80211_cqm_rssi_threshold_event, rssi_event)
+ __field(s32, rssi_level)
),
TP_fast_assign(
NETDEV_ASSIGN;
__entry->rssi_event = rssi_event;
+ __entry->rssi_level = rssi_level;
),
- TP_printk(NETDEV_PR_FMT ", rssi event: %d",
- NETDEV_PR_ARG, __entry->rssi_event)
+ TP_printk(NETDEV_PR_FMT ", rssi event: %d, level: %d",
+ NETDEV_PR_ARG, __entry->rssi_event, __entry->rssi_level)
);
TRACE_EVENT(cfg80211_reg_can_beacon,
@@ -3030,6 +3051,25 @@ DEFINE_EVENT(wiphy_wdev_evt, rdev_abort_scan,
TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
TP_ARGS(wiphy, wdev)
);
+
+TRACE_EVENT(rdev_set_multicast_to_unicast,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ const bool enabled),
+ TP_ARGS(wiphy, netdev, enabled),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(bool, enabled)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->enabled = enabled;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", unicast: %s",
+ WIPHY_PR_ARG, NETDEV_PR_ARG,
+ BOOL_TO_STR(__entry->enabled))
+);
#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
#undef TRACE_INCLUDE_PATH
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 659b507b347d..68e5f2ecee1a 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -13,6 +13,7 @@
#include <net/dsfield.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
+#include <linux/gcd.h>
#include "core.h"
#include "rdev-ops.h"
@@ -113,8 +114,7 @@ int ieee80211_frequency_to_channel(int freq)
}
EXPORT_SYMBOL(ieee80211_frequency_to_channel);
-struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy,
- int freq)
+struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq)
{
enum nl80211_band band;
struct ieee80211_supported_band *sband;
@@ -134,14 +134,13 @@ struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy,
return NULL;
}
-EXPORT_SYMBOL(__ieee80211_get_channel);
+EXPORT_SYMBOL(ieee80211_get_channel);
-static void set_mandatory_flags_band(struct ieee80211_supported_band *sband,
- enum nl80211_band band)
+static void set_mandatory_flags_band(struct ieee80211_supported_band *sband)
{
int i, want;
- switch (band) {
+ switch (sband->band) {
case NL80211_BAND_5GHZ:
want = 3;
for (i = 0; i < sband->n_bitrates; i++) {
@@ -191,6 +190,7 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband,
WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e);
break;
case NUM_NL80211_BANDS:
+ default:
WARN_ON(1);
break;
}
@@ -202,7 +202,7 @@ void ieee80211_set_bitrate_flags(struct wiphy *wiphy)
for (band = 0; band < NUM_NL80211_BANDS; band++)
if (wiphy->bands[band])
- set_mandatory_flags_band(wiphy->bands[band], band);
+ set_mandatory_flags_band(wiphy->bands[band]);
}
bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher)
@@ -618,8 +618,6 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
if (pskb_expand_head(skb, head_need, 0, GFP_ATOMIC))
return -ENOMEM;
-
- skb->truesize += head_need;
}
if (encaps_data) {
@@ -951,7 +949,7 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev)
ev->cr.resp_ie, ev->cr.resp_ie_len,
ev->cr.status,
ev->cr.status == WLAN_STATUS_SUCCESS,
- ev->cr.bss);
+ ev->cr.bss, ev->cr.timeout_reason);
break;
case EVENT_ROAMED:
__cfg80211_roamed(wdev, ev->rm.bss, ev->rm.req_ie,
@@ -1378,6 +1376,25 @@ static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id)
return false;
}
+static size_t skip_ie(const u8 *ies, size_t ielen, size_t pos)
+{
+ /* we assume a validly formed IEs buffer */
+ u8 len = ies[pos + 1];
+
+ pos += 2 + len;
+
+ /* the IE itself must have 255 bytes for fragments to follow */
+ if (len < 255)
+ return pos;
+
+ while (pos < ielen && ies[pos] == WLAN_EID_FRAGMENT) {
+ len = ies[pos + 1];
+ pos += 2 + len;
+ }
+
+ return pos;
+}
+
size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen,
const u8 *ids, int n_ids,
const u8 *after_ric, int n_after_ric,
@@ -1387,14 +1404,14 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen,
while (pos < ielen && ieee80211_id_in_list(ids, n_ids, ies[pos])) {
if (ies[pos] == WLAN_EID_RIC_DATA && n_after_ric) {
- pos += 2 + ies[pos + 1];
+ pos = skip_ie(ies, ielen, pos);
while (pos < ielen &&
!ieee80211_id_in_list(after_ric, n_after_ric,
ies[pos]))
- pos += 2 + ies[pos + 1];
+ pos = skip_ie(ies, ielen, pos);
} else {
- pos += 2 + ies[pos + 1];
+ pos = skip_ie(ies, ielen, pos);
}
}
@@ -1555,31 +1572,57 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef,
}
EXPORT_SYMBOL(ieee80211_chandef_to_operating_class);
-int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
- u32 beacon_int)
+static void cfg80211_calculate_bi_data(struct wiphy *wiphy, u32 new_beacon_int,
+ u32 *beacon_int_gcd,
+ bool *beacon_int_different)
{
struct wireless_dev *wdev;
- int res = 0;
- if (beacon_int < 10 || beacon_int > 10000)
- return -EINVAL;
+ *beacon_int_gcd = 0;
+ *beacon_int_different = false;
- list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
+ list_for_each_entry(wdev, &wiphy->wdev_list, list) {
if (!wdev->beacon_interval)
continue;
- if (wdev->beacon_interval != beacon_int) {
- res = -EINVAL;
- break;
+
+ if (!*beacon_int_gcd) {
+ *beacon_int_gcd = wdev->beacon_interval;
+ continue;
}
+
+ if (wdev->beacon_interval == *beacon_int_gcd)
+ continue;
+
+ *beacon_int_different = true;
+ *beacon_int_gcd = gcd(*beacon_int_gcd, wdev->beacon_interval);
+ }
+
+ if (new_beacon_int && *beacon_int_gcd != new_beacon_int) {
+ if (*beacon_int_gcd)
+ *beacon_int_different = true;
+ *beacon_int_gcd = gcd(*beacon_int_gcd, new_beacon_int);
}
+}
+
+int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
+ enum nl80211_iftype iftype, u32 beacon_int)
+{
+ /*
+ * This is just a basic pre-condition check; if interface combinations
+ * are possible the driver must already be checking those with a call
+ * to cfg80211_check_combinations(), in which case we'll validate more
+ * through the cfg80211_calculate_bi_data() call and code in
+ * cfg80211_iter_combinations().
+ */
- return res;
+ if (beacon_int < 10 || beacon_int > 10000)
+ return -EINVAL;
+
+ return 0;
}
int cfg80211_iter_combinations(struct wiphy *wiphy,
- const int num_different_channels,
- const u8 radar_detect,
- const int iftype_num[NUM_NL80211_IFTYPES],
+ struct iface_combination_params *params,
void (*iter)(const struct ieee80211_iface_combination *c,
void *data),
void *data)
@@ -1589,8 +1632,23 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
int i, j, iftype;
int num_interfaces = 0;
u32 used_iftypes = 0;
+ u32 beacon_int_gcd;
+ bool beacon_int_different;
- if (radar_detect) {
+ /*
+ * This is a bit strange, since the iteration used to rely only on
+ * the data given by the driver, but here it now relies on context,
+ * in form of the currently operating interfaces.
+ * This is OK for all current users, and saves us from having to
+ * push the GCD calculations into all the drivers.
+ * In the future, this should probably rely more on data that's in
+ * cfg80211 already - the only thing not would appear to be any new
+ * interfaces (while being brought up) and channel/radar data.
+ */
+ cfg80211_calculate_bi_data(wiphy, params->new_beacon_int,
+ &beacon_int_gcd, &beacon_int_different);
+
+ if (params->radar_detect) {
rcu_read_lock();
regdom = rcu_dereference(cfg80211_regdomain);
if (regdom)
@@ -1599,8 +1657,8 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
}
for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) {
- num_interfaces += iftype_num[iftype];
- if (iftype_num[iftype] > 0 &&
+ num_interfaces += params->iftype_num[iftype];
+ if (params->iftype_num[iftype] > 0 &&
!(wiphy->software_iftypes & BIT(iftype)))
used_iftypes |= BIT(iftype);
}
@@ -1614,7 +1672,7 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
if (num_interfaces > c->max_interfaces)
continue;
- if (num_different_channels > c->num_different_channels)
+ if (params->num_different_channels > c->num_different_channels)
continue;
limits = kmemdup(c->limits, sizeof(limits[0]) * c->n_limits,
@@ -1629,16 +1687,17 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
all_iftypes |= limits[j].types;
if (!(limits[j].types & BIT(iftype)))
continue;
- if (limits[j].max < iftype_num[iftype])
+ if (limits[j].max < params->iftype_num[iftype])
goto cont;
- limits[j].max -= iftype_num[iftype];
+ limits[j].max -= params->iftype_num[iftype];
}
}
- if (radar_detect != (c->radar_detect_widths & radar_detect))
+ if (params->radar_detect !=
+ (c->radar_detect_widths & params->radar_detect))
goto cont;
- if (radar_detect && c->radar_detect_regions &&
+ if (params->radar_detect && c->radar_detect_regions &&
!(c->radar_detect_regions & BIT(region)))
goto cont;
@@ -1650,6 +1709,14 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
if ((all_iftypes & used_iftypes) != used_iftypes)
goto cont;
+ if (beacon_int_gcd) {
+ if (c->beacon_int_min_gcd &&
+ beacon_int_gcd < c->beacon_int_min_gcd)
+ goto cont;
+ if (!c->beacon_int_min_gcd && beacon_int_different)
+ goto cont;
+ }
+
/* This combination covered all interface types and
* supported the requested numbers, so we're good.
*/
@@ -1672,14 +1739,11 @@ cfg80211_iter_sum_ifcombs(const struct ieee80211_iface_combination *c,
}
int cfg80211_check_combinations(struct wiphy *wiphy,
- const int num_different_channels,
- const u8 radar_detect,
- const int iftype_num[NUM_NL80211_IFTYPES])
+ struct iface_combination_params *params)
{
int err, num = 0;
- err = cfg80211_iter_combinations(wiphy, num_different_channels,
- radar_detect, iftype_num,
+ err = cfg80211_iter_combinations(wiphy, params,
cfg80211_iter_sum_ifcombs, &num);
if (err)
return err;
@@ -1781,6 +1845,21 @@ void cfg80211_free_nan_func(struct cfg80211_nan_func *f)
}
EXPORT_SYMBOL(cfg80211_free_nan_func);
+bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
+ u32 center_freq_khz, u32 bw_khz)
+{
+ u32 start_freq_khz, end_freq_khz;
+
+ start_freq_khz = center_freq_khz - (bw_khz / 2);
+ end_freq_khz = center_freq_khz + (bw_khz / 2);
+
+ if (start_freq_khz >= freq_range->start_freq_khz &&
+ end_freq_khz <= freq_range->end_freq_khz)
+ return true;
+
+ return false;
+}
+
/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
/* Ethernet-II snap header (RFC1042 for most EtherTypes) */
const unsigned char rfc1042_header[] __aligned(2) =
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 6250b1cfcde5..1a4db6790e20 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -1119,3 +1119,70 @@ int compat_wext_handle_ioctl(struct net *net, unsigned int cmd,
return ret;
}
#endif
+
+char *iwe_stream_add_event(struct iw_request_info *info, char *stream,
+ char *ends, struct iw_event *iwe, int event_len)
+{
+ int lcp_len = iwe_stream_lcp_len(info);
+
+ event_len = iwe_stream_event_len_adjust(info, event_len);
+
+ /* Check if it's possible */
+ if (likely((stream + event_len) < ends)) {
+ iwe->len = event_len;
+ /* Beware of alignement issues on 64 bits */
+ memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN);
+ memcpy(stream + lcp_len, &iwe->u,
+ event_len - lcp_len);
+ stream += event_len;
+ }
+
+ return stream;
+}
+EXPORT_SYMBOL(iwe_stream_add_event);
+
+char *iwe_stream_add_point(struct iw_request_info *info, char *stream,
+ char *ends, struct iw_event *iwe, char *extra)
+{
+ int event_len = iwe_stream_point_len(info) + iwe->u.data.length;
+ int point_len = iwe_stream_point_len(info);
+ int lcp_len = iwe_stream_lcp_len(info);
+
+ /* Check if it's possible */
+ if (likely((stream + event_len) < ends)) {
+ iwe->len = event_len;
+ memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN);
+ memcpy(stream + lcp_len,
+ ((char *) &iwe->u) + IW_EV_POINT_OFF,
+ IW_EV_POINT_PK_LEN - IW_EV_LCP_PK_LEN);
+ if (iwe->u.data.length && extra)
+ memcpy(stream + point_len, extra, iwe->u.data.length);
+ stream += event_len;
+ }
+
+ return stream;
+}
+EXPORT_SYMBOL(iwe_stream_add_point);
+
+char *iwe_stream_add_value(struct iw_request_info *info, char *event,
+ char *value, char *ends, struct iw_event *iwe,
+ int event_len)
+{
+ int lcp_len = iwe_stream_lcp_len(info);
+
+ /* Don't duplicate LCP */
+ event_len -= IW_EV_LCP_LEN;
+
+ /* Check if it's possible */
+ if (likely((value + event_len) < ends)) {
+ /* Add new value */
+ memcpy(value, &iwe->u, event_len);
+ value += event_len;
+ /* Patch LCP */
+ iwe->len = value - event;
+ memcpy(event, (char *) iwe, lcp_len);
+ }
+
+ return value;
+}
+EXPORT_SYMBOL(iwe_stream_add_value);
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index 995163830a61..c434f193f39a 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -105,30 +105,7 @@ int cfg80211_mgd_wext_siwfreq(struct net_device *dev,
goto out;
}
-
wdev->wext.connect.channel = chan;
-
- /*
- * SSID is not set, we just want to switch monitor channel,
- * this is really just backward compatibility, if the SSID
- * is set then we use the channel to select the BSS to use
- * to connect to instead. If we were connected on another
- * channel we disconnected above and reconnect below.
- */
- if (chan && !wdev->wext.connect.ssid_len) {
- struct cfg80211_chan_def chandef = {
- .width = NL80211_CHAN_WIDTH_20_NOHT,
- .center_freq1 = freq,
- };
-
- chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq);
- if (chandef.chan)
- err = cfg80211_set_monitor_channel(rdev, &chandef);
- else
- err = -EINVAL;
- goto out;
- }
-
err = cfg80211_mgd_wext_connect(rdev, wdev);
out:
wdev_unlock(wdev);
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index f83b74d3e2ac..8b911c29860e 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -41,7 +41,7 @@
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/net.h>
@@ -51,7 +51,7 @@
#include <linux/slab.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/termios.h> /* For TIOCINQ/OUTQ */
#include <linux/notifier.h>
@@ -852,7 +852,8 @@ static int x25_wait_for_data(struct sock *sk, long timeout)
return rc;
}
-static int x25_accept(struct socket *sock, struct socket *newsock, int flags)
+static int x25_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern)
{
struct sock *sk = sock->sk;
struct sock *newsk;
diff --git a/net/x25/sysctl_net_x25.c b/net/x25/sysctl_net_x25.c
index 43239527a205..a06dfe143c67 100644
--- a/net/x25/sysctl_net_x25.c
+++ b/net/x25/sysctl_net_x25.c
@@ -70,7 +70,7 @@ static struct ctl_table x25_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { 0, },
+ { },
};
void __init x25_register_sysctl(void)
diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c
index fd5ffb25873f..bcaa180d6a3f 100644
--- a/net/x25/x25_link.c
+++ b/net/x25/x25_link.c
@@ -29,7 +29,7 @@
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/init.h>
#include <net/x25.h>
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index bda1a13628a8..286ed25c1a69 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -4,6 +4,11 @@
config XFRM
bool
depends on NET
+ select GRO_CELLS
+
+config XFRM_OFFLOAD
+ bool
+ depends on XFRM
config XFRM_ALGO
tristate
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 6e3f0254d8a1..46bdb4fbed0b 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -19,16 +19,18 @@
static struct kmem_cache *secpath_cachep __read_mostly;
static DEFINE_SPINLOCK(xfrm_input_afinfo_lock);
-static struct xfrm_input_afinfo __rcu *xfrm_input_afinfo[NPROTO];
+static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1];
-int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo)
+static struct gro_cells gro_cells;
+static struct net_device xfrm_napi_dev;
+
+int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo)
{
int err = 0;
- if (unlikely(afinfo == NULL))
- return -EINVAL;
- if (unlikely(afinfo->family >= NPROTO))
+ if (WARN_ON(afinfo->family >= ARRAY_SIZE(xfrm_input_afinfo)))
return -EAFNOSUPPORT;
+
spin_lock_bh(&xfrm_input_afinfo_lock);
if (unlikely(xfrm_input_afinfo[afinfo->family] != NULL))
err = -EEXIST;
@@ -39,14 +41,10 @@ int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo)
}
EXPORT_SYMBOL(xfrm_input_register_afinfo);
-int xfrm_input_unregister_afinfo(struct xfrm_input_afinfo *afinfo)
+int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo)
{
int err = 0;
- if (unlikely(afinfo == NULL))
- return -EINVAL;
- if (unlikely(afinfo->family >= NPROTO))
- return -EAFNOSUPPORT;
spin_lock_bh(&xfrm_input_afinfo_lock);
if (likely(xfrm_input_afinfo[afinfo->family] != NULL)) {
if (unlikely(xfrm_input_afinfo[afinfo->family] != afinfo))
@@ -60,12 +58,13 @@ int xfrm_input_unregister_afinfo(struct xfrm_input_afinfo *afinfo)
}
EXPORT_SYMBOL(xfrm_input_unregister_afinfo);
-static struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family)
+static const struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family)
{
- struct xfrm_input_afinfo *afinfo;
+ const struct xfrm_input_afinfo *afinfo;
- if (unlikely(family >= NPROTO))
+ if (WARN_ON_ONCE(family >= ARRAY_SIZE(xfrm_input_afinfo)))
return NULL;
+
rcu_read_lock();
afinfo = rcu_dereference(xfrm_input_afinfo[family]);
if (unlikely(!afinfo))
@@ -73,22 +72,17 @@ static struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family)
return afinfo;
}
-static void xfrm_input_put_afinfo(struct xfrm_input_afinfo *afinfo)
-{
- rcu_read_unlock();
-}
-
static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol,
int err)
{
int ret;
- struct xfrm_input_afinfo *afinfo = xfrm_input_get_afinfo(family);
+ const struct xfrm_input_afinfo *afinfo = xfrm_input_get_afinfo(family);
if (!afinfo)
return -EAFNOSUPPORT;
ret = afinfo->callback(skb, protocol, err);
- xfrm_input_put_afinfo(afinfo);
+ rcu_read_unlock();
return ret;
}
@@ -111,6 +105,8 @@ struct sec_path *secpath_dup(struct sec_path *src)
return NULL;
sp->len = 0;
+ sp->olen = 0;
+
if (src) {
int i;
@@ -123,6 +119,24 @@ struct sec_path *secpath_dup(struct sec_path *src)
}
EXPORT_SYMBOL(secpath_dup);
+int secpath_set(struct sk_buff *skb)
+{
+ struct sec_path *sp;
+
+ /* Allocate new secpath or COW existing one. */
+ if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
+ sp = secpath_dup(skb->sp);
+ if (!sp)
+ return -ENOMEM;
+
+ if (skb->sp)
+ secpath_put(skb->sp);
+ skb->sp = sp;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(secpath_set);
+
/* Fetch spi and seq from ipsec header */
int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq)
@@ -158,6 +172,7 @@ int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq)
*seq = *(__be32 *)(skb_transport_header(skb) + offset_seq);
return 0;
}
+EXPORT_SYMBOL(xfrm_parse_spi);
int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb)
{
@@ -192,14 +207,23 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
unsigned int family;
int decaps = 0;
int async = 0;
+ struct xfrm_offload *xo;
+ bool xfrm_gro = false;
- /* A negative encap_type indicates async resumption. */
if (encap_type < 0) {
- async = 1;
x = xfrm_input_state(skb);
- seq = XFRM_SKB_CB(skb)->seq.input.low;
family = x->outer_mode->afinfo->family;
- goto resume;
+
+ /* An encap_type of -1 indicates async resumption. */
+ if (encap_type == -1) {
+ async = 1;
+ seq = XFRM_SKB_CB(skb)->seq.input.low;
+ goto resume;
+ }
+ /* encap_type < -1 indicates a GRO call. */
+ encap_type = 0;
+ seq = XFRM_SPI_SKB_CB(skb)->seq;
+ goto lock;
}
daddr = (xfrm_address_t *)(skb_network_header(skb) +
@@ -218,18 +242,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
break;
}
- /* Allocate new secpath or COW existing one. */
- if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
- struct sec_path *sp;
-
- sp = secpath_dup(skb->sp);
- if (!sp) {
- XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
- goto drop;
- }
- if (skb->sp)
- secpath_put(skb->sp);
- skb->sp = sp;
+ err = secpath_set(skb);
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
+ goto drop;
}
seq = 0;
@@ -253,6 +269,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
skb->sp->xvec[skb->sp->len++] = x;
+lock:
spin_lock(&x->lock);
if (unlikely(x->km.state != XFRM_STATE_VALID)) {
@@ -371,10 +388,21 @@ resume:
if (decaps) {
skb_dst_drop(skb);
- netif_rx(skb);
+ gro_cells_receive(&gro_cells, skb);
return 0;
} else {
- return x->inner_mode->afinfo->transport_finish(skb, async);
+ xo = xfrm_offload(skb);
+ if (xo)
+ xfrm_gro = xo->flags & XFRM_GRO;
+
+ err = x->inner_mode->afinfo->transport_finish(skb, async);
+ if (xfrm_gro) {
+ skb_dst_drop(skb);
+ gro_cells_receive(&gro_cells, skb);
+ return err;
+ }
+
+ return err;
}
drop_unlock:
@@ -394,6 +422,13 @@ EXPORT_SYMBOL(xfrm_input_resume);
void __init xfrm_input_init(void)
{
+ int err;
+
+ init_dummy_netdev(&xfrm_napi_dev);
+ err = gro_cells_init(&gro_cells, &xfrm_napi_dev);
+ if (err)
+ gro_cells.cells = NULL;
+
secpath_cachep = kmem_cache_create("secpath_cache",
sizeof(struct sec_path),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 637387bbaaea..8ba29fe58352 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -246,10 +246,8 @@ void xfrm_local_error(struct sk_buff *skb, int mtu)
return;
afinfo = xfrm_state_get_afinfo(proto);
- if (!afinfo)
- return;
-
- afinfo->local_error(skb, mtu);
- xfrm_state_put_afinfo(afinfo);
+ if (afinfo)
+ afinfo->local_error(skb, mtu);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(xfrm_local_error);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 5bf7e1bfeac7..236cbbc0ab9c 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -45,7 +45,7 @@ struct xfrm_flo {
};
static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
-static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO]
+static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
__read_mostly;
static struct kmem_cache *xfrm_dst_cache __read_mostly;
@@ -103,11 +103,11 @@ bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl
return false;
}
-static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
+static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
{
- struct xfrm_policy_afinfo *afinfo;
+ const struct xfrm_policy_afinfo *afinfo;
- if (unlikely(family >= NPROTO))
+ if (unlikely(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
return NULL;
rcu_read_lock();
afinfo = rcu_dereference(xfrm_policy_afinfo[family]);
@@ -116,18 +116,13 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
return afinfo;
}
-static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
-{
- rcu_read_unlock();
-}
-
static inline struct dst_entry *__xfrm_dst_lookup(struct net *net,
int tos, int oif,
const xfrm_address_t *saddr,
const xfrm_address_t *daddr,
int family)
{
- struct xfrm_policy_afinfo *afinfo;
+ const struct xfrm_policy_afinfo *afinfo;
struct dst_entry *dst;
afinfo = xfrm_policy_get_afinfo(family);
@@ -136,7 +131,7 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net,
dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr);
- xfrm_policy_put_afinfo(afinfo);
+ rcu_read_unlock();
return dst;
}
@@ -330,7 +325,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
}
EXPORT_SYMBOL(xfrm_policy_destroy);
-/* Rule must be locked. Release descentant resources, announce
+/* Rule must be locked. Release descendant resources, announce
* entry dead. The rule must be unlinked from lists to the moment.
*/
@@ -1248,7 +1243,7 @@ static inline int policy_to_flow_dir(int dir)
}
static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
- const struct flowi *fl)
+ const struct flowi *fl, u16 family)
{
struct xfrm_policy *pol;
@@ -1256,8 +1251,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
again:
pol = rcu_dereference(sk->sk_policy[dir]);
if (pol != NULL) {
- bool match = xfrm_selector_match(&pol->selector, fl,
- sk->sk_family);
+ bool match = xfrm_selector_match(&pol->selector, fl, family);
int err = 0;
if (match) {
@@ -1431,12 +1425,12 @@ xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local,
xfrm_address_t *remote, unsigned short family)
{
int err;
- struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+ const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
if (unlikely(afinfo == NULL))
return -EINVAL;
err = afinfo->get_saddr(net, oif, local, remote);
- xfrm_policy_put_afinfo(afinfo);
+ rcu_read_unlock();
return err;
}
@@ -1538,21 +1532,15 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
}
-/* Check that the bundle accepts the flow and its components are
- * still valid.
- */
-
-static inline int xfrm_get_tos(const struct flowi *fl, int family)
+static int xfrm_get_tos(const struct flowi *fl, int family)
{
- struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
- int tos;
-
- if (!afinfo)
- return -EINVAL;
+ const struct xfrm_policy_afinfo *afinfo;
+ int tos = 0;
- tos = afinfo->get_tos(fl);
+ afinfo = xfrm_policy_get_afinfo(family);
+ tos = afinfo ? afinfo->get_tos(fl) : 0;
- xfrm_policy_put_afinfo(afinfo);
+ rcu_read_unlock();
return tos;
}
@@ -1609,7 +1597,7 @@ static const struct flow_cache_ops xfrm_bundle_fc_ops = {
static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
{
- struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+ const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
struct dst_ops *dst_ops;
struct xfrm_dst *xdst;
@@ -1638,7 +1626,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
} else
xdst = ERR_PTR(-ENOBUFS);
- xfrm_policy_put_afinfo(afinfo);
+ rcu_read_unlock();
return xdst;
}
@@ -1646,7 +1634,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
int nfheader_len)
{
- struct xfrm_policy_afinfo *afinfo =
+ const struct xfrm_policy_afinfo *afinfo =
xfrm_policy_get_afinfo(dst->ops->family);
int err;
@@ -1655,7 +1643,7 @@ static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
err = afinfo->init_path(path, dst, nfheader_len);
- xfrm_policy_put_afinfo(afinfo);
+ rcu_read_unlock();
return err;
}
@@ -1663,7 +1651,7 @@ static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
const struct flowi *fl)
{
- struct xfrm_policy_afinfo *afinfo =
+ const struct xfrm_policy_afinfo *afinfo =
xfrm_policy_get_afinfo(xdst->u.dst.ops->family);
int err;
@@ -1672,7 +1660,7 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
err = afinfo->fill_dst(xdst, dev, fl);
- xfrm_policy_put_afinfo(afinfo);
+ rcu_read_unlock();
return err;
}
@@ -1705,9 +1693,6 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
xfrm_flowi_addr_get(fl, &saddr, &daddr, family);
tos = xfrm_get_tos(fl, family);
- err = tos;
- if (tos < 0)
- goto put_states;
dst_hold(dst);
@@ -2215,7 +2200,7 @@ error:
static struct dst_entry *make_blackhole(struct net *net, u16 family,
struct dst_entry *dst_orig)
{
- struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+ const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
struct dst_entry *ret;
if (!afinfo) {
@@ -2224,7 +2209,7 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family,
} else {
ret = afinfo->blackhole_route(net, dst_orig);
}
- xfrm_policy_put_afinfo(afinfo);
+ rcu_read_unlock();
return ret;
}
@@ -2253,7 +2238,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
sk = sk_const_to_full_sk(sk);
if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
num_pols = 1;
- pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
+ pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family);
err = xfrm_expand_policies(fl, family, pols,
&num_pols, &num_xfrms);
if (err < 0)
@@ -2466,7 +2451,7 @@ xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int star
int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
unsigned int family, int reverse)
{
- struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+ const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
int err;
if (unlikely(afinfo == NULL))
@@ -2474,7 +2459,7 @@ int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
afinfo->decode_session(skb, fl, reverse);
err = security_xfrm_decode_session(skb, &fl->flowi_secid);
- xfrm_policy_put_afinfo(afinfo);
+ rcu_read_unlock();
return err;
}
EXPORT_SYMBOL(__xfrm_decode_session);
@@ -2532,7 +2517,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
pol = NULL;
sk = sk_to_full_sk(sk);
if (sk && sk->sk_policy[dir]) {
- pol = xfrm_sk_policy_lookup(sk, dir, &fl);
+ pol = xfrm_sk_policy_lookup(sk, dir, &fl, family);
if (IS_ERR(pol)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
return 0;
@@ -2742,10 +2727,11 @@ void xfrm_garbage_collect(struct net *net)
}
EXPORT_SYMBOL(xfrm_garbage_collect);
-static void xfrm_garbage_collect_deferred(struct net *net)
+void xfrm_garbage_collect_deferred(struct net *net)
{
flow_cache_flush_deferred(net);
}
+EXPORT_SYMBOL(xfrm_garbage_collect_deferred);
static void xfrm_init_pmtu(struct dst_entry *dst)
{
@@ -2849,22 +2835,52 @@ static unsigned int xfrm_mtu(const struct dst_entry *dst)
return mtu ? : dst_mtu(dst->path);
}
+static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
+ const void *daddr)
+{
+ const struct dst_entry *path = dst->path;
+
+ for (; dst != path; dst = dst->child) {
+ const struct xfrm_state *xfrm = dst->xfrm;
+
+ if (xfrm->props.mode == XFRM_MODE_TRANSPORT)
+ continue;
+ if (xfrm->type->flags & XFRM_TYPE_REMOTE_COADDR)
+ daddr = xfrm->coaddr;
+ else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR))
+ daddr = &xfrm->id.daddr;
+ }
+ return daddr;
+}
+
static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
struct sk_buff *skb,
const void *daddr)
{
- return dst->path->ops->neigh_lookup(dst, skb, daddr);
+ const struct dst_entry *path = dst->path;
+
+ if (!skb)
+ daddr = xfrm_get_dst_nexthop(dst, daddr);
+ return path->ops->neigh_lookup(path, skb, daddr);
+}
+
+static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+ const struct dst_entry *path = dst->path;
+
+ daddr = xfrm_get_dst_nexthop(dst, daddr);
+ path->ops->confirm_neigh(path, daddr);
}
-int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
+int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family)
{
int err = 0;
- if (unlikely(afinfo == NULL))
- return -EINVAL;
- if (unlikely(afinfo->family >= NPROTO))
+
+ if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
return -EAFNOSUPPORT;
+
spin_lock(&xfrm_policy_afinfo_lock);
- if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL))
+ if (unlikely(xfrm_policy_afinfo[family] != NULL))
err = -EEXIST;
else {
struct dst_ops *dst_ops = afinfo->dst_ops;
@@ -2882,9 +2898,9 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
dst_ops->link_failure = xfrm_link_failure;
if (likely(dst_ops->neigh_lookup == NULL))
dst_ops->neigh_lookup = xfrm_neigh_lookup;
- if (likely(afinfo->garbage_collect == NULL))
- afinfo->garbage_collect = xfrm_garbage_collect_deferred;
- rcu_assign_pointer(xfrm_policy_afinfo[afinfo->family], afinfo);
+ if (likely(!dst_ops->confirm_neigh))
+ dst_ops->confirm_neigh = xfrm_confirm_neigh;
+ rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo);
}
spin_unlock(&xfrm_policy_afinfo_lock);
@@ -2892,34 +2908,24 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
}
EXPORT_SYMBOL(xfrm_policy_register_afinfo);
-int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
+void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo)
{
- int err = 0;
- if (unlikely(afinfo == NULL))
- return -EINVAL;
- if (unlikely(afinfo->family >= NPROTO))
- return -EAFNOSUPPORT;
- spin_lock(&xfrm_policy_afinfo_lock);
- if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) {
- if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo))
- err = -EINVAL;
- else
- RCU_INIT_POINTER(xfrm_policy_afinfo[afinfo->family],
- NULL);
+ struct dst_ops *dst_ops = afinfo->dst_ops;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(xfrm_policy_afinfo); i++) {
+ if (xfrm_policy_afinfo[i] != afinfo)
+ continue;
+ RCU_INIT_POINTER(xfrm_policy_afinfo[i], NULL);
+ break;
}
- spin_unlock(&xfrm_policy_afinfo_lock);
- if (!err) {
- struct dst_ops *dst_ops = afinfo->dst_ops;
- synchronize_rcu();
+ synchronize_rcu();
- dst_ops->kmem_cachep = NULL;
- dst_ops->check = NULL;
- dst_ops->negative_advice = NULL;
- dst_ops->link_failure = NULL;
- afinfo->garbage_collect = NULL;
- }
- return err;
+ dst_ops->kmem_cachep = NULL;
+ dst_ops->check = NULL;
+ dst_ops->negative_advice = NULL;
+ dst_ops->link_failure = NULL;
}
EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
@@ -3062,6 +3068,11 @@ static int __net_init xfrm_net_init(struct net *net)
{
int rv;
+ /* Initialize the per-net locks here */
+ spin_lock_init(&net->xfrm.xfrm_state_lock);
+ spin_lock_init(&net->xfrm.xfrm_policy_lock);
+ mutex_init(&net->xfrm.xfrm_cfg_mutex);
+
rv = xfrm_statistics_init(net);
if (rv < 0)
goto out_statistics;
@@ -3078,11 +3089,6 @@ static int __net_init xfrm_net_init(struct net *net)
if (rv < 0)
goto out;
- /* Initialize the per-net locks here */
- spin_lock_init(&net->xfrm.xfrm_state_lock);
- spin_lock_init(&net->xfrm.xfrm_policy_lock);
- mutex_init(&net->xfrm.xfrm_cfg_mutex);
-
return 0;
out:
@@ -3113,6 +3119,7 @@ static struct pernet_operations __net_initdata xfrm_net_ops = {
void __init xfrm_init(void)
{
+ flow_cache_hp_init();
register_pernet_subsys(&xfrm_net_ops);
seqcount_init(&xfrm_policy_hash_generation);
xfrm_input_init();
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 419bf5d463bd..5a597dbbe564 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -20,7 +20,7 @@
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/audit.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/ktime.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
@@ -192,7 +192,7 @@ int xfrm_register_type(const struct xfrm_type *type, unsigned short family)
else
err = -EEXIST;
spin_unlock_bh(&xfrm_type_lock);
- xfrm_state_put_afinfo(afinfo);
+ rcu_read_unlock();
return err;
}
EXPORT_SYMBOL(xfrm_register_type);
@@ -213,7 +213,7 @@ int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family)
else
typemap[type->proto] = NULL;
spin_unlock_bh(&xfrm_type_lock);
- xfrm_state_put_afinfo(afinfo);
+ rcu_read_unlock();
return err;
}
EXPORT_SYMBOL(xfrm_unregister_type);
@@ -231,17 +231,18 @@ retry:
return NULL;
typemap = afinfo->type_map;
- type = typemap[proto];
+ type = READ_ONCE(typemap[proto]);
if (unlikely(type && !try_module_get(type->owner)))
type = NULL;
+
+ rcu_read_unlock();
+
if (!type && !modload_attempted) {
- xfrm_state_put_afinfo(afinfo);
request_module("xfrm-type-%d-%d", family, proto);
modload_attempted = 1;
goto retry;
}
- xfrm_state_put_afinfo(afinfo);
return type;
}
@@ -280,7 +281,7 @@ int xfrm_register_mode(struct xfrm_mode *mode, int family)
out:
spin_unlock_bh(&xfrm_mode_lock);
- xfrm_state_put_afinfo(afinfo);
+ rcu_read_unlock();
return err;
}
EXPORT_SYMBOL(xfrm_register_mode);
@@ -308,7 +309,7 @@ int xfrm_unregister_mode(struct xfrm_mode *mode, int family)
}
spin_unlock_bh(&xfrm_mode_lock);
- xfrm_state_put_afinfo(afinfo);
+ rcu_read_unlock();
return err;
}
EXPORT_SYMBOL(xfrm_unregister_mode);
@@ -327,17 +328,17 @@ retry:
if (unlikely(afinfo == NULL))
return NULL;
- mode = afinfo->mode_map[encap];
+ mode = READ_ONCE(afinfo->mode_map[encap]);
if (unlikely(mode && !try_module_get(mode->owner)))
mode = NULL;
+
+ rcu_read_unlock();
if (!mode && !modload_attempted) {
- xfrm_state_put_afinfo(afinfo);
request_module("xfrm-mode-%d-%d", family, encap);
modload_attempted = 1;
goto retry;
}
- xfrm_state_put_afinfo(afinfo);
return mode;
}
@@ -388,14 +389,6 @@ static void xfrm_state_gc_task(struct work_struct *work)
xfrm_state_gc_destroy(x);
}
-static inline unsigned long make_jiffies(long secs)
-{
- if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
- return MAX_SCHEDULE_TIMEOUT-1;
- else
- return secs*HZ;
-}
-
static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
{
struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer);
@@ -417,7 +410,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
if (x->xflags & XFRM_SOFT_EXPIRE) {
/* enter hard expire without soft expire first?!
* setting a new date could trigger this.
- * workarbound: fix x->curflt.add_time by below:
+ * workaround: fix x->curflt.add_time by below:
*/
x->curlft.add_time = now - x->saved_tmo - 1;
tmo = x->lft.hard_add_expires_seconds - x->saved_tmo;
@@ -647,26 +640,25 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
}
EXPORT_SYMBOL(xfrm_sad_getinfo);
-static int
+static void
xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl,
const struct xfrm_tmpl *tmpl,
const xfrm_address_t *daddr, const xfrm_address_t *saddr,
unsigned short family)
{
- struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
+ struct xfrm_state_afinfo *afinfo = xfrm_state_afinfo_get_rcu(family);
+
if (!afinfo)
- return -1;
+ return;
+
afinfo->init_tempsel(&x->sel, fl);
if (family != tmpl->encap_family) {
- xfrm_state_put_afinfo(afinfo);
- afinfo = xfrm_state_get_afinfo(tmpl->encap_family);
+ afinfo = xfrm_state_afinfo_get_rcu(tmpl->encap_family);
if (!afinfo)
- return -1;
+ return;
}
afinfo->init_temprop(x, tmpl, daddr, saddr);
- xfrm_state_put_afinfo(afinfo);
- return 0;
}
static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
@@ -1412,7 +1404,7 @@ int xfrm_state_check_expire(struct xfrm_state *x)
if (x->curlft.bytes >= x->lft.hard_byte_limit ||
x->curlft.packets >= x->lft.hard_packet_limit) {
x->km.state = XFRM_STATE_EXPIRED;
- tasklet_hrtimer_start(&x->mtimer, ktime_set(0, 0), HRTIMER_MODE_REL);
+ tasklet_hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL);
return -EINVAL;
}
@@ -1482,7 +1474,7 @@ xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
if (afinfo->tmpl_sort)
err = afinfo->tmpl_sort(dst, src, n);
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
- xfrm_state_put_afinfo(afinfo);
+ rcu_read_unlock();
return err;
}
EXPORT_SYMBOL(xfrm_tmpl_sort);
@@ -1502,7 +1494,7 @@ xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
if (afinfo->state_sort)
err = afinfo->state_sort(dst, src, n);
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
- xfrm_state_put_afinfo(afinfo);
+ rcu_read_unlock();
return err;
}
EXPORT_SYMBOL(xfrm_state_sort);
@@ -1940,10 +1932,10 @@ EXPORT_SYMBOL(xfrm_unregister_km);
int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo)
{
int err = 0;
- if (unlikely(afinfo == NULL))
- return -EINVAL;
- if (unlikely(afinfo->family >= NPROTO))
+
+ if (WARN_ON(afinfo->family >= NPROTO))
return -EAFNOSUPPORT;
+
spin_lock_bh(&xfrm_state_afinfo_lock);
if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL))
err = -EEXIST;
@@ -1956,14 +1948,14 @@ EXPORT_SYMBOL(xfrm_state_register_afinfo);
int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
{
- int err = 0;
- if (unlikely(afinfo == NULL))
- return -EINVAL;
- if (unlikely(afinfo->family >= NPROTO))
+ int err = 0, family = afinfo->family;
+
+ if (WARN_ON(family >= NPROTO))
return -EAFNOSUPPORT;
+
spin_lock_bh(&xfrm_state_afinfo_lock);
if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) {
- if (unlikely(xfrm_state_afinfo[afinfo->family] != afinfo))
+ if (rcu_access_pointer(xfrm_state_afinfo[family]) != afinfo)
err = -EINVAL;
else
RCU_INIT_POINTER(xfrm_state_afinfo[afinfo->family], NULL);
@@ -1974,6 +1966,14 @@ int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
}
EXPORT_SYMBOL(xfrm_state_unregister_afinfo);
+struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family)
+{
+ if (unlikely(family >= NPROTO))
+ return NULL;
+
+ return rcu_dereference(xfrm_state_afinfo[family]);
+}
+
struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family)
{
struct xfrm_state_afinfo *afinfo;
@@ -1986,11 +1986,6 @@ struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family)
return afinfo;
}
-void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo)
-{
- rcu_read_unlock();
-}
-
/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */
void xfrm_state_delete_tunnel(struct xfrm_state *x)
{
@@ -2008,16 +2003,13 @@ EXPORT_SYMBOL(xfrm_state_delete_tunnel);
int xfrm_state_mtu(struct xfrm_state *x, int mtu)
{
- int res;
+ const struct xfrm_type *type = READ_ONCE(x->type);
- spin_lock_bh(&x->lock);
if (x->km.state == XFRM_STATE_VALID &&
- x->type && x->type->get_mtu)
- res = x->type->get_mtu(x, mtu);
- else
- res = mtu - x->props.header_len;
- spin_unlock_bh(&x->lock);
- return res;
+ type && type->get_mtu)
+ return type->get_mtu(x, mtu);
+
+ return mtu - x->props.header_len;
}
int __xfrm_init_state(struct xfrm_state *x, bool init_replay)
@@ -2036,7 +2028,7 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay)
if (afinfo->init_flags)
err = afinfo->init_flags(x);
- xfrm_state_put_afinfo(afinfo);
+ rcu_read_unlock();
if (err)
goto error;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 671a1d0333f0..40a8aa39220d 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -27,7 +27,7 @@
#include <net/xfrm.h>
#include <net/netlink.h>
#include <net/ah.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <linux/in6.h>
#endif
@@ -412,7 +412,14 @@ static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_es
up = nla_data(rp);
ulen = xfrm_replay_state_esn_len(up);
- if (nla_len(rp) < ulen || xfrm_replay_state_esn_len(replay_esn) != ulen)
+ /* Check the overall length and the internal bitmap length to avoid
+ * potential overflow. */
+ if (nla_len(rp) < ulen ||
+ xfrm_replay_state_esn_len(replay_esn) != ulen ||
+ replay_esn->bmp_len != up->bmp_len)
+ return -EINVAL;
+
+ if (up->replay_window > up->bmp_len * sizeof(__u32) * 8)
return -EINVAL;
return 0;