From e785fa0a164aa11001cba931367c7f94ffaff888 Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Wed, 13 Sep 2017 00:21:21 +0200 Subject: nl80211: check for the required netlink attributes presence nl80211_set_rekey_data() does not check if the required attributes NL80211_REKEY_DATA_{REPLAY_CTR,KEK,KCK} are present when processing NL80211_CMD_SET_REKEY_OFFLOAD request. This request can be issued by users with CAP_NET_ADMIN privilege and may result in NULL dereference and a system crash. Add a check for the required attributes presence. This patch is based on the patch by bo Zhang. This fixes CVE-2017-12153. References: https://bugzilla.redhat.com/show_bug.cgi?id=1491046 Fixes: e5497d766ad ("cfg80211/nl80211: support GTK rekey offload") Cc: # v3.1-rc1 Reported-by: bo Zhang Signed-off-by: Vladis Dronov Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0df8023f480b..fbd5593e88cb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10903,6 +10903,9 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) if (err) return err; + if (!tb[NL80211_REKEY_DATA_REPLAY_CTR] || !tb[NL80211_REKEY_DATA_KEK] || + !tb[NL80211_REKEY_DATA_KCK]) + return -EINVAL; if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN) return -ERANGE; if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN) -- cgit From b0ade85165b3caeb0cd908cffe5921a39f25c243 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 10 Sep 2017 13:41:41 +0200 Subject: netfilter: nat: Do not use ARRAY_SIZE() on spinlocks to fix zero div MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If no spinlock debugging options (CONFIG_GENERIC_LOCKBREAK, CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_LOCK_ALLOC) are enabled on a UP platform (e.g. m68k defconfig), arch_spinlock_t is an empty struct, hence using ARRAY_SIZE(nf_nat_locks) causes a division by zero: net/netfilter/nf_nat_core.c: In function ‘nf_nat_setup_info’: net/netfilter/nf_nat_core.c:432: warning: division by zero net/netfilter/nf_nat_core.c: In function ‘__nf_nat_cleanup_conntrack’: net/netfilter/nf_nat_core.c:535: warning: division by zero net/netfilter/nf_nat_core.c:537: warning: division by zero net/netfilter/nf_nat_core.c: In function ‘nf_nat_init’: net/netfilter/nf_nat_core.c:810: warning: division by zero net/netfilter/nf_nat_core.c:811: warning: division by zero net/netfilter/nf_nat_core.c:824: warning: division by zero Fix this by using the CONNTRACK_LOCKS definition instead. Suggested-by: Florian Westphal Fixes: 8073e960a03bf7b5 ("netfilter: nat: use keyed locks") Signed-off-by: Geert Uytterhoeven Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index f393a7086025..af8345fc4fbd 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -429,7 +429,7 @@ nf_nat_setup_info(struct nf_conn *ct, srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - lock = &nf_nat_locks[srchash % ARRAY_SIZE(nf_nat_locks)]; + lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); @@ -532,9 +532,9 @@ static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) unsigned int h; h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - spin_lock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); + spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); + spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); } static int nf_nat_proto_clean(struct nf_conn *ct, void *data) @@ -807,8 +807,8 @@ static int __init nf_nat_init(void) /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; - if (nf_nat_htable_size < ARRAY_SIZE(nf_nat_locks)) - nf_nat_htable_size = ARRAY_SIZE(nf_nat_locks); + if (nf_nat_htable_size < CONNTRACK_LOCKS) + nf_nat_htable_size = CONNTRACK_LOCKS; nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); if (!nf_nat_bysource) @@ -821,7 +821,7 @@ static int __init nf_nat_init(void) return ret; } - for (i = 0; i < ARRAY_SIZE(nf_nat_locks); i++) + for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_nat_locks[i]); nf_ct_helper_expectfn_register(&follow_master_nat); -- cgit From 7f4f7dd4417d9efd038b14d39c70170db2e0baa0 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Mon, 11 Sep 2017 21:52:40 +0200 Subject: netfilter: ipset: ipset list may return wrong member count for set with timeout Simple testcase: $ ipset create test hash:ip timeout 5 $ ipset add test 1.2.3.4 $ ipset add test 1.2.2.2 $ sleep 5 $ ipset l Name: test Type: hash:ip Revision: 5 Header: family inet hashsize 1024 maxelem 65536 timeout 5 Size in memory: 296 References: 0 Number of entries: 2 Members: We return "Number of entries: 2" but no members are listed. That is because mtype_list runs "ip_set_timeout_expired" and does not list the expired entries, but set->elements is never upated (until mtype_gc cleans it up later). Reviewed-by: Joshua Hunt Signed-off-by: Vishwanath Pai Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_gen.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index f236c0bc7b3f..51063d9ed0f7 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1041,12 +1041,24 @@ out: static int mtype_head(struct ip_set *set, struct sk_buff *skb) { - const struct htype *h = set->data; + struct htype *h = set->data; const struct htable *t; struct nlattr *nested; size_t memsize; u8 htable_bits; + /* If any members have expired, set->elements will be wrong + * mytype_expire function will update it with the right count. + * we do not hold set->lock here, so grab it first. + * set->elements can still be incorrect in the case of a huge set, + * because elements might time out during the listing. + */ + if (SET_WITH_TIMEOUT(set)) { + spin_lock_bh(&set->lock); + mtype_expire(set, h); + spin_unlock_bh(&set->lock); + } + rcu_read_lock_bh(); t = rcu_dereference_bh_nfnl(h->table); memsize = mtype_ahash_memsize(h, t) + set->ext_size; -- cgit From 63ecc3d9436f8012e49dc846d6cb0a85a3433517 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 13 Sep 2017 19:30:51 -0600 Subject: udpv6: Fix the checksum computation when HW checksum does not apply While trying an ESP transport mode encryption for UDPv6 packets of datagram size 1436 with MTU 1500, checksum error was observed in the secondary fragment. This error occurs due to the UDP payload checksum being missed out when computing the full checksum for these packets in udp6_hwcsum_outgoing(). Fixes: d39d938c8228 ("ipv6: Introduce udpv6_send_skb()") Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- net/ipv6/udp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e2ecfb137297..40d7234c27b9 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1015,6 +1015,7 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, */ offset = skb_transport_offset(skb); skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); + csum = skb->csum; skb->ip_summed = CHECKSUM_NONE; -- cgit From 265698d7e6132a2d41471135534f4f36ad15b09c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 18 Sep 2017 22:46:36 +0200 Subject: nl80211: fix null-ptr dereference on invalid mesh configuration If TX rates are specified during mesh join, the channel must also be specified. Check the channel pointer to avoid a null pointer dereference if it isn't. Reported-by: Jouni Malinen Fixes: 8564e38206de ("cfg80211: add checks for beacon rate, extend to mesh") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fbd5593e88cb..690874293cfc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9987,6 +9987,9 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) if (err) return err; + if (!setup.chandef.chan) + return -EINVAL; + err = validate_beacon_tx_rate(rdev, setup.chandef.chan->band, &setup.beacon_rate); if (err) -- cgit From 76cc0d3282d4b933fa144fa41fbc5318e0fdca24 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Sep 2017 12:00:07 +0800 Subject: ip6_gre: skb_push ipv6hdr before packing the header in ip6gre_header Now in ip6gre_header before packing the ipv6 header, it skb_push t->hlen which only includes encap_hlen + tun_hlen. It means greh and inner header would be over written by ipv6 stuff and ipv6h might have no chance to set up. Jianlin found this issue when using remote any on ip6_gre, the packets he captured on gre dev are truncated: 22:50:26.210866 Out ethertype IPv6 (0x86dd), length 120: truncated-ip6 -\ 8128 bytes missing!(flowlabel 0x92f40, hlim 0, next-header Options (0) \ payload length: 8192) ::1:2000:0 > ::1:0:86dd: HBH [trunc] ip-proto-128 \ 8184 It should also skb_push ipv6hdr so that ipv6h points to the right position to set ipv6 stuff up. This patch is to skb_push hlen + sizeof(*ipv6h) and also fix some indents in ip6gre_header. Fixes: c12b395a4664 ("gre: Support GRE over IPv6") Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b7a72d409334..20f66f4c9460 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -940,24 +940,25 @@ done: } static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, - const void *daddr, const void *saddr, unsigned int len) + unsigned short type, const void *daddr, + const void *saddr, unsigned int len) { struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h = skb_push(skb, t->hlen); - __be16 *p = (__be16 *)(ipv6h+1); + struct ipv6hdr *ipv6h; + __be16 *p; - ip6_flow_hdr(ipv6h, 0, - ip6_make_flowlabel(dev_net(dev), skb, - t->fl.u.ip6.flowlabel, true, - &t->fl.u.ip6)); + ipv6h = skb_push(skb, t->hlen + sizeof(*ipv6h)); + ip6_flow_hdr(ipv6h, 0, ip6_make_flowlabel(dev_net(dev), skb, + t->fl.u.ip6.flowlabel, + true, &t->fl.u.ip6)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; ipv6h->daddr = t->parms.raddr; - p[0] = t->parms.o_flags; - p[1] = htons(type); + p = (__be16 *)(ipv6h + 1); + p[0] = t->parms.o_flags; + p[1] = htons(type); /* * Set the source hardware address. -- cgit From f2654a4781318dc7ab8d6cde66f1fa39eab980a9 Mon Sep 17 00:00:00 2001 From: Fahad Kunnathadi Date: Fri, 15 Sep 2017 12:01:58 +0530 Subject: net: phy: Fix mask value write on gmii2rgmii converter speed register To clear Speed Selection in MDIO control register(0x10), ie, clear bits 6 and 13 to zero while keeping other bits same. Before AND operation,The Mask value has to be perform with bitwise NOT operation (ie, ~ operator) This patch clears current speed selection before writing the new speed settings to gmii2rgmii converter Fixes: f411a6160bd4 ("net: phy: Add gmiitorgmii converter support") Signed-off-by: Fahad Kunnathadi Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/xilinx_gmii2rgmii.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/xilinx_gmii2rgmii.c b/drivers/net/phy/xilinx_gmii2rgmii.c index d15dd3938ba8..2e5150b0b8d5 100644 --- a/drivers/net/phy/xilinx_gmii2rgmii.c +++ b/drivers/net/phy/xilinx_gmii2rgmii.c @@ -44,7 +44,7 @@ static int xgmiitorgmii_read_status(struct phy_device *phydev) priv->phy_drv->read_status(phydev); val = mdiobus_read(phydev->mdio.bus, priv->addr, XILINX_GMII2RGMII_REG); - val &= XILINX_GMII2RGMII_SPEED_MASK; + val &= ~XILINX_GMII2RGMII_SPEED_MASK; if (phydev->speed == SPEED_1000) val |= BMCR_SPEED1000; -- cgit From 8c22dab03ad072e45060c299c70d02a4f6fc4aab Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Sep 2017 15:58:33 +0800 Subject: ip6_tunnel: do not allow loading ip6_tunnel if ipv6 is disabled in cmdline If ipv6 has been disabled from cmdline since kernel started, it makes no sense to allow users to create any ip6 tunnel. Otherwise, it could some potential problem. Jianlin found a kernel crash caused by this in ip6_gre when he set ipv6.disable=1 in grub: [ 209.588865] Unable to handle kernel paging request for data at address 0x00000080 [ 209.588872] Faulting instruction address: 0xc000000000a3aa6c [ 209.588879] Oops: Kernel access of bad area, sig: 11 [#1] [ 209.589062] NIP [c000000000a3aa6c] fib_rules_lookup+0x4c/0x260 [ 209.589071] LR [c000000000b9ad90] fib6_rule_lookup+0x50/0xb0 [ 209.589076] Call Trace: [ 209.589097] fib6_rule_lookup+0x50/0xb0 [ 209.589106] rt6_lookup+0xc4/0x110 [ 209.589116] ip6gre_tnl_link_config+0x214/0x2f0 [ip6_gre] [ 209.589125] ip6gre_newlink+0x138/0x3a0 [ip6_gre] [ 209.589134] rtnl_newlink+0x798/0xb80 [ 209.589142] rtnetlink_rcv_msg+0xec/0x390 [ 209.589151] netlink_rcv_skb+0x138/0x150 [ 209.589159] rtnetlink_rcv+0x48/0x70 [ 209.589169] netlink_unicast+0x538/0x640 [ 209.589175] netlink_sendmsg+0x40c/0x480 [ 209.589184] ___sys_sendmsg+0x384/0x4e0 [ 209.589194] SyS_sendmsg+0xd4/0x140 [ 209.589201] SyS_socketcall+0x3e0/0x4f0 [ 209.589209] system_call+0x38/0xe0 This patch is to return -EOPNOTSUPP in ip6_tunnel_init if ipv6 has been disabled from cmdline. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index ae73164559d5..f2f21c24915f 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2259,6 +2259,9 @@ static int __init ip6_tunnel_init(void) { int err; + if (!ipv6_mod_enabled()) + return -EOPNOTSUPP; + err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; -- cgit From 3ff4cbec87da48b0ec1f7b6196607b034de0c680 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Sat, 16 Sep 2017 14:02:21 +0200 Subject: net/sched: cls_matchall: fix crash when used with classful qdisc this script, edited from Linux Advanced Routing and Traffic Control guide tc q a dev en0 root handle 1: htb default a tc c a dev en0 parent 1: classid 1:1 htb rate 6mbit burst 15k tc c a dev en0 parent 1:1 classid 1:a htb rate 5mbit ceil 6mbit burst 15k tc c a dev en0 parent 1:1 classid 1:b htb rate 1mbit ceil 6mbit burst 15k tc f a dev en0 parent 1:0 prio 1 $clsname $clsargs classid 1:b ping $address -c1 tc -s c s dev en0 classifies traffic to 1:b or 1:a, depending on whether the packet matches or not the pattern $clsargs of filter $clsname. However, when $clsname is 'matchall', a systematic crash can be observed in htb_classify(). HTB and classful qdiscs don't assign initial value to struct tcf_result, but then they expect it to contain valid values after filters have been run. Thus, current 'matchall' ignores the TCA_MATCHALL_CLASSID attribute, configured by user, and makes HTB (and classful qdiscs) dereference random pointers. By assigning head->res to *res in mall_classify(), before the actions are invoked, we fix this crash and enable TCA_MATCHALL_CLASSID functionality, that had no effect on 'matchall' classifier since its first introduction. BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1460213 Reported-by: Jiri Benc Fixes: b87f7936a932 ("net/sched: introduce Match-all classifier") Signed-off-by: Davide Caratti Acked-by: Yotam Gigi Signed-off-by: David S. Miller --- net/sched/cls_matchall.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 21cc45caf842..eeac606c95ab 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -32,6 +32,7 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (tc_skip_sw(head->flags)) return -1; + *res = head->res; return tcf_exts_exec(skb, &head->exts, res); } -- cgit From 51513748ddfe7a5d2812158a1e0570499b0c511c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 16 Sep 2017 13:10:06 -0700 Subject: Documentation: networking: fix ASCII art in switchdev.txt Fix ASCII art in Documentation/networking/switchdev.txt: Change non-ASCII "spaces" to ASCII spaces. Change 2 erroneous '+' characters in ASCII art to '-' (at the '*' characters below): line 32: +--+----+----+----+-*--+----+---+ +-----+-----+ line 41: +--------------+---*------------+ Signed-off-by: Randy Dunlap Acked-by: Pavel Machek Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/networking/switchdev.txt | 68 +++++++++++++++++----------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt index 5e40e1f68873..82236a17b5e6 100644 --- a/Documentation/networking/switchdev.txt +++ b/Documentation/networking/switchdev.txt @@ -13,42 +13,42 @@ an example setup using a data-center-class switch ASIC chip. Other setups with SR-IOV or soft switches, such as OVS, are possible. -                             User-space tools - -       user space                   | -      +-------------------------------------------------------------------+ -       kernel                       | Netlink -                                    | -                     +--------------+-------------------------------+ -                     |         Network stack                        | -                     |           (Linux)                            | -                     |                                              | -                     +----------------------------------------------+ + User-space tools + + user space | + +-------------------------------------------------------------------+ + kernel | Netlink + | + +--------------+-------------------------------+ + | Network stack | + | (Linux) | + | | + +----------------------------------------------+ sw1p2 sw1p4 sw1p6 -                      sw1p1  + sw1p3 +  sw1p5 +         eth1 -                        +    |    +    |    +    |            + -                        |    |    |    |    |    |            | -                     +--+----+----+----+-+--+----+---+  +-----+-----+ -                     |         Switch driver         |  |    mgmt   | -                     |        (this document)        |  |   driver  | -                     |                               |  |           | -                     +--------------+----------------+  +-----------+ -                                    | -       kernel                       | HW bus (eg PCI) -      +-------------------------------------------------------------------+ -       hardware                     | -                     +--------------+---+------------+ -                     |         Switch device (sw1)   | -                     |  +----+                       +--------+ -                     |  |    v offloaded data path   | mgmt port -                     |  |    |                       | -                     +--|----|----+----+----+----+---+ -                        |    |    |    |    |    | -                        +    +    +    +    +    + -                       p1   p2   p3   p4   p5   p6 - -                             front-panel ports + sw1p1 + sw1p3 + sw1p5 + eth1 + + | + | + | + + | | | | | | | + +--+----+----+----+----+----+---+ +-----+-----+ + | Switch driver | | mgmt | + | (this document) | | driver | + | | | | + +--------------+----------------+ +-----------+ + | + kernel | HW bus (eg PCI) + +-------------------------------------------------------------------+ + hardware | + +--------------+----------------+ + | Switch device (sw1) | + | +----+ +--------+ + | | v offloaded data path | mgmt port + | | | | + +--|----|----+----+----+----+---+ + | | | | | | + + + + + + + + p1 p2 p3 p4 p5 p6 + + front-panel ports Fig 1. -- cgit From 1e3c5ec66119783440ed211ae527674651affa9b Mon Sep 17 00:00:00 2001 From: Sathya Perla Date: Mon, 18 Sep 2017 17:05:37 +0530 Subject: bnxt_en: check for ingress qdisc in flower offload Check for ingress-only qdisc for flower offload, as other qdiscs are not supported for flower offload. Suggested-by: Jiri Pirko Signed-off-by: Sathya Perla Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index ccd699fb2d70..7dd3d131043a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -750,6 +750,10 @@ int bnxt_tc_setup_flower(struct bnxt *bp, u16 src_fid, { int rc = 0; + if (!is_classid_clsact_ingress(cls_flower->common.classid) || + cls_flower->common.chain_index) + return -EOPNOTSUPP; + switch (cls_flower->command) { case TC_CLSFLOWER_REPLACE: rc = bnxt_tc_add_flow(bp, src_fid, cls_flower); -- cgit From 582db7e0c4c2fc5bb4f932f268035883385e3692 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 18 Sep 2017 15:03:46 +0200 Subject: bpf: devmap: pass on return value of bpf_map_precharge_memlock If bpf_map_precharge_memlock in dev_map_alloc, -ENOMEM is returned regardless of the actual error produced by bpf_map_precharge_memlock. Fix it by passing on the error returned by bpf_map_precharge_memlock. Also return -EINVAL instead of -ENOMEM if the page count overflow check fails. This makes dev_map_alloc match the behavior of other bpf maps' alloc functions wrt. return values. Signed-off-by: Tobias Klauser Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 959c9a07f318..e093d9a2c4dd 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -75,8 +75,8 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr) static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; + int err = -EINVAL; u64 cost; - int err; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -108,6 +108,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (err) goto free_dtab; + err = -ENOMEM; + /* A per cpu bitfield with a bit per possible net device */ dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr), __alignof__(unsigned long)); @@ -128,7 +130,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) free_dtab: free_percpu(dtab->flush_needed); kfree(dtab); - return ERR_PTR(-ENOMEM); + return ERR_PTR(err); } static void dev_map_free(struct bpf_map *map) -- cgit From 4c7124413aa759b8ea0b90cd39177e525396e662 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 18 Sep 2017 11:05:16 -0700 Subject: tcp: remove two unused functions remove tcp_may_send_now and tcp_snd_test that are no longer used Fixes: 840a3cbe8969 ("tcp: remove forward retransmit feature") Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 - net/ipv4/tcp_output.c | 34 ---------------------------------- 2 files changed, 35 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index b510f284427a..3bc910a9bfc6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -544,7 +544,6 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, int min_tso_segs); void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); -bool tcp_may_send_now(struct sock *sk); int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); void tcp_retransmit_timer(struct sock *sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1c839c99114c..517d737059d1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1806,40 +1806,6 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, return !after(end_seq, tcp_wnd_end(tp)); } -/* This checks if the data bearing packet SKB (usually tcp_send_head(sk)) - * should be put on the wire right now. If so, it returns the number of - * packets allowed by the congestion window. - */ -static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, - unsigned int cur_mss, int nonagle) -{ - const struct tcp_sock *tp = tcp_sk(sk); - unsigned int cwnd_quota; - - tcp_init_tso_segs(skb, cur_mss); - - if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) - return 0; - - cwnd_quota = tcp_cwnd_test(tp, skb); - if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss)) - cwnd_quota = 0; - - return cwnd_quota; -} - -/* Test if sending is allowed right now. */ -bool tcp_may_send_now(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = tcp_send_head(sk); - - return skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk), - (tcp_skb_is_last(sk, skb) ? - tp->nonagle : TCP_NAGLE_PUSH)); -} - /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet * which is put after SKB on the list. It is very much like * tcp_fragment() except that it may make several kinds of assumptions -- cgit From 129c6cda2de2a8ac44fab096152469999b727faf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 18 Sep 2017 13:03:43 -0700 Subject: 8139too: revisit napi_complete_done() usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems we have to be more careful in napi_complete_done() use. This patch is not a revert, as it seems we can avoid bug that Ville reported by moving the napi_complete_done() test in the spinlock section. Many thanks to Ville for detective work and all tests. Fixes: 617f01211baf ("8139too: use napi_complete_done()") Reported-by: Ville Syrjälä Tested-by: Ville Syrjälä Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/8139too.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c index ca22f2898664..d24b47b8e0b2 100644 --- a/drivers/net/ethernet/realtek/8139too.c +++ b/drivers/net/ethernet/realtek/8139too.c @@ -2135,11 +2135,12 @@ static int rtl8139_poll(struct napi_struct *napi, int budget) if (likely(RTL_R16(IntrStatus) & RxAckBits)) work_done += rtl8139_rx(dev, tp, budget); - if (work_done < budget && napi_complete_done(napi, work_done)) { + if (work_done < budget) { unsigned long flags; spin_lock_irqsave(&tp->lock, flags); - RTL_W16_F(IntrMask, rtl8139_intr_mask); + if (napi_complete_done(napi, work_done)) + RTL_W16_F(IntrMask, rtl8139_intr_mask); spin_unlock_irqrestore(&tp->lock, flags); } spin_unlock(&tp->rx_lock); -- cgit From 8ecb1a29e11e24c458ee4ee59447d0ddf8274589 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 18 Sep 2017 16:31:30 -0700 Subject: net: systemport: Fix 64-bit statistics dependency There are several problems with commit 10377ba7673d ("net: systemport: Support 64bit statistics", first one got fixed in 7095c973453e ("net: systemport: Fix 64-bit stats deadlock"). The second problem is that this specific code updates the stats64.tx_{packets,bytes} from ndo_get_stats64() and that is what we are returning to ethtool -S. If we are not running a tool that involves calling ndo_get_stats64(), then we won't get updated ethtool stats. The solution to this is to update the stats from both call sites, factoring that into a specific function, While at it, don't just check the sizeof() but also the type of the statistics in order to use the 64-bit stats seqlock. Fixes: 10377ba7673d ("net: systemport: Support 64bit statistics") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bcmsysport.c | 52 ++++++++++++++++++------------ 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index c3c53f6cd9e6..83eec9a8c275 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -432,6 +432,27 @@ static void bcm_sysport_update_mib_counters(struct bcm_sysport_priv *priv) netif_dbg(priv, hw, priv->netdev, "updated MIB counters\n"); } +static void bcm_sysport_update_tx_stats(struct bcm_sysport_priv *priv, + u64 *tx_bytes, u64 *tx_packets) +{ + struct bcm_sysport_tx_ring *ring; + u64 bytes = 0, packets = 0; + unsigned int start; + unsigned int q; + + for (q = 0; q < priv->netdev->num_tx_queues; q++) { + ring = &priv->tx_rings[q]; + do { + start = u64_stats_fetch_begin_irq(&priv->syncp); + bytes = ring->bytes; + packets = ring->packets; + } while (u64_stats_fetch_retry_irq(&priv->syncp, start)); + + *tx_bytes += bytes; + *tx_packets += packets; + } +} + static void bcm_sysport_get_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { @@ -439,11 +460,16 @@ static void bcm_sysport_get_stats(struct net_device *dev, struct bcm_sysport_stats64 *stats64 = &priv->stats64; struct u64_stats_sync *syncp = &priv->syncp; struct bcm_sysport_tx_ring *ring; + u64 tx_bytes = 0, tx_packets = 0; unsigned int start; int i, j; - if (netif_running(dev)) + if (netif_running(dev)) { bcm_sysport_update_mib_counters(priv); + bcm_sysport_update_tx_stats(priv, &tx_bytes, &tx_packets); + stats64->tx_bytes = tx_bytes; + stats64->tx_packets = tx_packets; + } for (i = 0, j = 0; i < BCM_SYSPORT_STATS_LEN; i++) { const struct bcm_sysport_stats *s; @@ -461,12 +487,13 @@ static void bcm_sysport_get_stats(struct net_device *dev, continue; p += s->stat_offset; - if (s->stat_sizeof == sizeof(u64)) + if (s->stat_sizeof == sizeof(u64) && + s->type == BCM_SYSPORT_STAT_NETDEV64) { do { start = u64_stats_fetch_begin_irq(syncp); data[i] = *(u64 *)p; } while (u64_stats_fetch_retry_irq(syncp, start)); - else + } else data[i] = *(u32 *)p; j++; } @@ -1716,27 +1743,12 @@ static void bcm_sysport_get_stats64(struct net_device *dev, { struct bcm_sysport_priv *priv = netdev_priv(dev); struct bcm_sysport_stats64 *stats64 = &priv->stats64; - struct bcm_sysport_tx_ring *ring; - u64 tx_packets = 0, tx_bytes = 0; unsigned int start; - unsigned int q; netdev_stats_to_stats64(stats, &dev->stats); - for (q = 0; q < dev->num_tx_queues; q++) { - ring = &priv->tx_rings[q]; - do { - start = u64_stats_fetch_begin_irq(&priv->syncp); - tx_bytes = ring->bytes; - tx_packets = ring->packets; - } while (u64_stats_fetch_retry_irq(&priv->syncp, start)); - - stats->tx_bytes += tx_bytes; - stats->tx_packets += tx_packets; - } - - stats64->tx_bytes = stats->tx_bytes; - stats64->tx_packets = stats->tx_packets; + bcm_sysport_update_tx_stats(priv, &stats->tx_bytes, + &stats->tx_packets); do { start = u64_stats_fetch_begin_irq(&priv->syncp); -- cgit From 3993491bf27117782bee05debc6a6afa51d61760 Mon Sep 17 00:00:00 2001 From: Ariel Elior Date: Tue, 19 Sep 2017 12:54:34 +0300 Subject: MAINTAINERS: Remove Yuval Mintz from maintainers list Remove Yuval from maintaining the bnx2x & qed* modules as he is no longer working for the company. Thanks Yuval for your huge contributions and tireless efforts over the many years and various companies. Ariel Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2281af4b41b6..955f034fd523 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2853,7 +2853,6 @@ S: Supported F: drivers/scsi/bnx2i/ BROADCOM BNX2X 10 GIGABIT ETHERNET DRIVER -M: Yuval Mintz M: Ariel Elior M: everest-linux-l2@cavium.com L: netdev@vger.kernel.org @@ -11047,7 +11046,6 @@ S: Supported F: drivers/scsi/qedi/ QLOGIC QL4xxx ETHERNET DRIVER -M: Yuval Mintz M: Ariel Elior M: everest-linux-l2@cavium.com L: netdev@vger.kernel.org -- cgit From 6073512cc8e2c48bed5c6625c02c5e4ae50cec34 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Mon, 18 Sep 2017 14:59:20 +0200 Subject: net: phy: Kconfig: Fix PHY infrastructure menu in menuconfig Since the integration of PHYLINK, the configuration option which used to be under the PHY infrastructure menu in menuconfig ended up one level up (the network device driver section) By placing PHYLINK option right after PHYLIB entry, it broke the way Kconfig used to build the menu. See kconfig-language.txt, section "Menu structure", 2nd method. This is fixed by placing the PHYLINK option just before PHYLIB. Fixes: 9525ae83959b ("phylink: add phylink infrastructure") Signed-off-by: Jerome Brunet Signed-off-by: David S. Miller --- drivers/net/phy/Kconfig | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index a9d16a3af514..cd931cf9dcc2 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -160,15 +160,6 @@ config MDIO_XGENE endif -menuconfig PHYLIB - tristate "PHY Device support and infrastructure" - depends on NETDEVICES - select MDIO_DEVICE - help - Ethernet controllers are usually attached to PHY - devices. This option provides infrastructure for - managing PHY devices. - config PHYLINK tristate depends on NETDEVICES @@ -179,6 +170,15 @@ config PHYLINK configuration links, PHYs, and Serdes links with MAC level autonegotiation modes. +menuconfig PHYLIB + tristate "PHY Device support and infrastructure" + depends on NETDEVICES + select MDIO_DEVICE + help + Ethernet controllers are usually attached to PHY + devices. This option provides infrastructure for + managing PHY devices. + if PHYLIB config SWPHY -- cgit From 0647169cf9aa441700eb8f23ea49be060626534b Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 19 Sep 2017 12:41:37 +0200 Subject: rhashtable: Documentation tweak Clarify that rhashtable_walk_{stop,start} will not reset the iterator to the beginning of the hash table. Confusion between rhashtable_walk_enter and rhashtable_walk_start has already lead to a bug. Signed-off-by: Andreas Gruenbacher Signed-off-by: David S. Miller --- lib/rhashtable.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 707ca5d677c6..ddd7dde87c3c 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -735,9 +735,9 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_exit); * rhashtable_walk_start - Start a hash table walk * @iter: Hash table iterator * - * Start a hash table walk. Note that we take the RCU lock in all - * cases including when we return an error. So you must always call - * rhashtable_walk_stop to clean up. + * Start a hash table walk at the current iterator position. Note that we take + * the RCU lock in all cases including when we return an error. So you must + * always call rhashtable_walk_stop to clean up. * * Returns zero if successful. * @@ -846,7 +846,8 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_next); * rhashtable_walk_stop - Finish a hash table walk * @iter: Hash table iterator * - * Finish a hash table walk. + * Finish a hash table walk. Does not reset the iterator to the start of the + * hash table. */ void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU) -- cgit From 930651a75bf1ba6893a8b8475270664ebdb6cf4a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 09:15:59 -0700 Subject: bpf: do not disable/enable BH in bpf_map_free_id() syzkaller reported following splat [1] Since hard irq are disabled by the caller, bpf_map_free_id() should not try to enable/disable BH. Another solution would be to change htab_map_delete_elem() to defer the free_htab_elem() call after raw_spin_unlock_irqrestore(&b->lock, flags), but this might be not enough to cover other code paths. [1] WARNING: CPU: 1 PID: 8052 at kernel/softirq.c:161 __local_bh_enable_ip +0x1e/0x160 kernel/softirq.c:161 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 8052 Comm: syz-executor1 Not tainted 4.13.0-next-20170915+ #23 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x417 kernel/panic.c:181 __warn+0x1c4/0x1d9 kernel/panic.c:542 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178 do_trap_no_signal arch/x86/kernel/traps.c:212 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:261 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905 RIP: 0010:__local_bh_enable_ip+0x1e/0x160 kernel/softirq.c:161 RSP: 0018:ffff8801cdcd7748 EFLAGS: 00010046 RAX: 0000000000000082 RBX: 0000000000000201 RCX: 0000000000000000 RDX: 1ffffffff0b5933c RSI: 0000000000000201 RDI: ffffffff85ac99e0 RBP: ffff8801cdcd7758 R08: ffffffff85b87158 R09: 1ffff10039b9aec6 R10: ffff8801c99f24c0 R11: 0000000000000002 R12: ffffffff817b0b47 R13: dffffc0000000000 R14: ffff8801cdcd77e8 R15: 0000000000000001 __raw_spin_unlock_bh include/linux/spinlock_api_smp.h:176 [inline] _raw_spin_unlock_bh+0x30/0x40 kernel/locking/spinlock.c:207 spin_unlock_bh include/linux/spinlock.h:361 [inline] bpf_map_free_id kernel/bpf/syscall.c:197 [inline] __bpf_map_put+0x267/0x320 kernel/bpf/syscall.c:227 bpf_map_put+0x1a/0x20 kernel/bpf/syscall.c:235 bpf_map_fd_put_ptr+0x15/0x20 kernel/bpf/map_in_map.c:96 free_htab_elem+0xc3/0x1b0 kernel/bpf/hashtab.c:658 htab_map_delete_elem+0x74d/0x970 kernel/bpf/hashtab.c:1063 map_delete_elem kernel/bpf/syscall.c:633 [inline] SYSC_bpf kernel/bpf/syscall.c:1479 [inline] SyS_bpf+0x2188/0x46a0 kernel/bpf/syscall.c:1451 entry_SYSCALL_64_fastpath+0x1f/0xbe Fixes: f3f1c054c288 ("bpf: Introduce bpf_map ID") Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau Acked-by: Martin KaFai Lau Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cb17e1cd1d43..25d074920a00 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -186,15 +186,17 @@ static int bpf_map_alloc_id(struct bpf_map *map) static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { + unsigned long flags; + if (do_idr_lock) - spin_lock_bh(&map_idr_lock); + spin_lock_irqsave(&map_idr_lock, flags); else __acquire(&map_idr_lock); idr_remove(&map_idr, map->id); if (do_idr_lock) - spin_unlock_bh(&map_idr_lock); + spin_unlock_irqrestore(&map_idr_lock, flags); else __release(&map_idr_lock); } -- cgit From 2a4776e14f7da3d48fffb4edbb82355742f23478 Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:10 +0100 Subject: net: hns3: Fixes initialization of phy address from firmware Default phy address of every port is 0. Therefore, phy address for each port need to be fetched from firmware and device initialized with fetched non-default phy address. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Lipeng Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index bb45365fb817..db4e07dac29a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1066,6 +1066,7 @@ static int hclge_configure(struct hclge_dev *hdev) for (i = 0; i < ETH_ALEN; i++) hdev->hw.mac.mac_addr[i] = cfg.mac_addr[i]; hdev->hw.mac.media_type = cfg.media_type; + hdev->hw.mac.phy_addr = cfg.phy_addr; hdev->num_desc = cfg.tqp_desc_num; hdev->tm_info.num_pg = 1; hdev->tm_info.num_tc = cfg.tc_num; -- cgit From c5b1b97522ef32d2170c9aa1a0c1eec179acbb3a Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:11 +0100 Subject: net: hns3: Fixes the command used to unmap ring from vector This patch fixes the IMP command being used to unmap the vector from the corresponding ring. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Lipeng Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index db4e07dac29a..e324bc6e9f4f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2779,7 +2779,7 @@ static int hclge_unmap_ring_from_vector( } i = 0; hclge_cmd_setup_basic_desc(&desc, - HCLGE_OPC_ADD_RING_TO_VECTOR, + HCLGE_OPC_DEL_RING_TO_VECTOR, false); req->int_vector_id = vector_id; } -- cgit From 0305b443a3ba5b84aa474786026df04a70460135 Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:12 +0100 Subject: net: hns3: Fixes ring-to-vector map-and-unmap command This patch fixes the vector-to-ring map and unmap command and adds INT_GL(for, Gap Limiting Interrupts) and VF id to it as required by the hardware interface. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Lipeng Signed-off-by: Mingguang Qu Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 8 ++++++-- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index 91ae0135ee50..c2b613b40509 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -238,7 +238,7 @@ struct hclge_tqp_map { u8 rsv[18]; }; -#define HCLGE_VECTOR_ELEMENTS_PER_CMD 11 +#define HCLGE_VECTOR_ELEMENTS_PER_CMD 10 enum hclge_int_type { HCLGE_INT_TX, @@ -252,8 +252,12 @@ struct hclge_ctrl_vector_chain { #define HCLGE_INT_TYPE_S 0 #define HCLGE_INT_TYPE_M 0x3 #define HCLGE_TQP_ID_S 2 -#define HCLGE_TQP_ID_M (0x3fff << HCLGE_TQP_ID_S) +#define HCLGE_TQP_ID_M (0x7ff << HCLGE_TQP_ID_S) +#define HCLGE_INT_GL_IDX_S 13 +#define HCLGE_INT_GL_IDX_M (0x3 << HCLGE_INT_GL_IDX_S) __le16 tqp_type_and_id[HCLGE_VECTOR_ELEMENTS_PER_CMD]; + u8 vfid; + u8 rsv; }; #define HCLGE_TC_NUM 8 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e324bc6e9f4f..eafd9c678162 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2680,7 +2680,11 @@ int hclge_map_vport_ring_to_vector(struct hclge_vport *vport, int vector_id, hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); hnae_set_field(req->tqp_type_and_id[i], HCLGE_TQP_ID_M, HCLGE_TQP_ID_S, node->tqp_index); + hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_GL_IDX_M, + HCLGE_INT_GL_IDX_S, + hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); req->tqp_type_and_id[i] = cpu_to_le16(req->tqp_type_and_id[i]); + req->vfid = vport->vport_id; if (++i >= HCLGE_VECTOR_ELEMENTS_PER_CMD) { req->int_cause_num = HCLGE_VECTOR_ELEMENTS_PER_CMD; @@ -2764,8 +2768,12 @@ static int hclge_unmap_ring_from_vector( hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); hnae_set_field(req->tqp_type_and_id[i], HCLGE_TQP_ID_M, HCLGE_TQP_ID_S, node->tqp_index); + hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_GL_IDX_M, + HCLGE_INT_GL_IDX_S, + hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); req->tqp_type_and_id[i] = cpu_to_le16(req->tqp_type_and_id[i]); + req->vfid = vport->vport_id; if (++i >= HCLGE_VECTOR_ELEMENTS_PER_CMD) { req->int_cause_num = HCLGE_VECTOR_ELEMENTS_PER_CMD; -- cgit From 139e8792537b327252a9676591a78e6408d50a85 Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:13 +0100 Subject: net: hns3: Fixes the initialization of MAC address in hardware This patch fixes the initialization of MAC address, fetched from HNS3 firmware i.e. when it is not randomly generated, to the HNS3 hardware. Fixes: ca60906d2795 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC") Signed-off-by: Lipeng Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index 1c3e29447891..4d68d6ea5143 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -2705,10 +2705,11 @@ static void hns3_init_mac_addr(struct net_device *netdev) eth_hw_addr_random(netdev); dev_warn(priv->dev, "using random MAC address %pM\n", netdev->dev_addr); - /* Also copy this new MAC address into hdev */ - if (h->ae_algo->ops->set_mac_addr) - h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr); } + + if (h->ae_algo->ops->set_mac_addr) + h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr); + } static void hns3_nic_set_priv_ops(struct net_device *netdev) -- cgit From fbbb1536b220eb4f4f95cbceae6579489a8adad5 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 19 Sep 2017 17:17:14 +0100 Subject: net: hns3: Fixes the ether address copy with appropriate API This patch replaces the ethernet address copy instance with more appropriate ether_addr_copy() function. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index eafd9c678162..8e172afd4876 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1063,8 +1063,7 @@ static int hclge_configure(struct hclge_dev *hdev) hdev->base_tqp_pid = 0; hdev->rss_size_max = 1; hdev->rx_buf_len = cfg.rx_buf_len; - for (i = 0; i < ETH_ALEN; i++) - hdev->hw.mac.mac_addr[i] = cfg.mac_addr[i]; + ether_addr_copy(hdev->hw.mac.mac_addr, cfg.mac_addr); hdev->hw.mac.media_type = cfg.media_type; hdev->hw.mac.phy_addr = cfg.phy_addr; hdev->num_desc = cfg.tqp_desc_num; -- cgit From 5e43aef8491ae3b5feb79cd15260faf39303ef33 Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:15 +0100 Subject: net: hns3: Fixes the default VLAN-id of PF When there is no vlan id in the packets, hardware will treat the vlan id as 0 and look for the mac_vlan table. This patch set the default vlan id of PF as 0. Without this config, it will fail when look for mac_vlan table, and hardware will drop packets. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Mingguang Qu Signed-off-by: Lipeng Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 8e172afd4876..74008ef23169 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3673,6 +3673,7 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev) { #define HCLGE_VLAN_TYPE_VF_TABLE 0 #define HCLGE_VLAN_TYPE_PORT_TABLE 1 + struct hnae3_handle *handle; int ret; ret = hclge_set_vlan_filter_ctrl(hdev, HCLGE_VLAN_TYPE_VF_TABLE, @@ -3682,8 +3683,11 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev) ret = hclge_set_vlan_filter_ctrl(hdev, HCLGE_VLAN_TYPE_PORT_TABLE, true); + if (ret) + return ret; - return ret; + handle = &hdev->vport[0].nic; + return hclge_set_port_vlan_filter(handle, htons(ETH_P_8021Q), 0, false); } static int hclge_set_mtu(struct hnae3_handle *handle, int new_mtu) -- cgit From 90f7b11a5a0081feb7041fcc795c9a131a62a725 Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:16 +0100 Subject: net: hns3: Fixes the premature exit of loop when matching clients When register/unregister ae_dev, ae_dev should match all client in the client_list. Enet and roce can co-exists together so we should continue checking for enet and roce presence together. So break should not be there. Above caused problems in loading and unloading of modules. Fixes: 38eddd126772 ("net: hns3: Add support of the HNAE3 framework") Signed-off-by: Lipeng Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.c | 43 ++++++----------------------- 1 file changed, 9 insertions(+), 34 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c index 59efbd605416..5bcb2238acb2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c @@ -37,20 +37,15 @@ static bool hnae3_client_match(enum hnae3_client_type client_type, } static int hnae3_match_n_instantiate(struct hnae3_client *client, - struct hnae3_ae_dev *ae_dev, - bool is_reg, bool *matched) + struct hnae3_ae_dev *ae_dev, bool is_reg) { int ret; - *matched = false; - /* check if this client matches the type of ae_dev */ if (!(hnae3_client_match(client->type, ae_dev->dev_type) && hnae_get_bit(ae_dev->flag, HNAE3_DEV_INITED_B))) { return 0; } - /* there is a match of client and dev */ - *matched = true; /* now, (un-)instantiate client by calling lower layer */ if (is_reg) { @@ -69,7 +64,6 @@ int hnae3_register_client(struct hnae3_client *client) { struct hnae3_client *client_tmp; struct hnae3_ae_dev *ae_dev; - bool matched; int ret = 0; mutex_lock(&hnae3_common_lock); @@ -86,7 +80,7 @@ int hnae3_register_client(struct hnae3_client *client) /* if the client could not be initialized on current port, for * any error reasons, move on to next available port */ - ret = hnae3_match_n_instantiate(client, ae_dev, true, &matched); + ret = hnae3_match_n_instantiate(client, ae_dev, true); if (ret) dev_err(&ae_dev->pdev->dev, "match and instantiation failed for port\n"); @@ -102,12 +96,11 @@ EXPORT_SYMBOL(hnae3_register_client); void hnae3_unregister_client(struct hnae3_client *client) { struct hnae3_ae_dev *ae_dev; - bool matched; mutex_lock(&hnae3_common_lock); /* un-initialize the client on every matched port */ list_for_each_entry(ae_dev, &hnae3_ae_dev_list, node) { - hnae3_match_n_instantiate(client, ae_dev, false, &matched); + hnae3_match_n_instantiate(client, ae_dev, false); } list_del(&client->node); @@ -124,7 +117,6 @@ int hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo) const struct pci_device_id *id; struct hnae3_ae_dev *ae_dev; struct hnae3_client *client; - bool matched; int ret = 0; mutex_lock(&hnae3_common_lock); @@ -151,13 +143,10 @@ int hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo) * initialize the figure out client instance */ list_for_each_entry(client, &hnae3_client_list, node) { - ret = hnae3_match_n_instantiate(client, ae_dev, true, - &matched); + ret = hnae3_match_n_instantiate(client, ae_dev, true); if (ret) dev_err(&ae_dev->pdev->dev, "match and instantiation failed\n"); - if (matched) - break; } } @@ -175,7 +164,6 @@ void hnae3_unregister_ae_algo(struct hnae3_ae_algo *ae_algo) const struct pci_device_id *id; struct hnae3_ae_dev *ae_dev; struct hnae3_client *client; - bool matched; mutex_lock(&hnae3_common_lock); /* Check if there are matched ae_dev */ @@ -187,12 +175,8 @@ void hnae3_unregister_ae_algo(struct hnae3_ae_algo *ae_algo) /* check the client list for the match with this ae_dev type and * un-initialize the figure out client instance */ - list_for_each_entry(client, &hnae3_client_list, node) { - hnae3_match_n_instantiate(client, ae_dev, false, - &matched); - if (matched) - break; - } + list_for_each_entry(client, &hnae3_client_list, node) + hnae3_match_n_instantiate(client, ae_dev, false); ae_algo->ops->uninit_ae_dev(ae_dev); hnae_set_bit(ae_dev->flag, HNAE3_DEV_INITED_B, 0); @@ -212,7 +196,6 @@ int hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev) const struct pci_device_id *id; struct hnae3_ae_algo *ae_algo; struct hnae3_client *client; - bool matched; int ret = 0; mutex_lock(&hnae3_common_lock); @@ -246,13 +229,10 @@ int hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev) * initialize the figure out client instance */ list_for_each_entry(client, &hnae3_client_list, node) { - ret = hnae3_match_n_instantiate(client, ae_dev, true, - &matched); + ret = hnae3_match_n_instantiate(client, ae_dev, true); if (ret) dev_err(&ae_dev->pdev->dev, "match and instantiation failed\n"); - if (matched) - break; } out_err: @@ -270,7 +250,6 @@ void hnae3_unregister_ae_dev(struct hnae3_ae_dev *ae_dev) const struct pci_device_id *id; struct hnae3_ae_algo *ae_algo; struct hnae3_client *client; - bool matched; mutex_lock(&hnae3_common_lock); /* Check if there are matched ae_algo */ @@ -279,12 +258,8 @@ void hnae3_unregister_ae_dev(struct hnae3_ae_dev *ae_dev) if (!id) continue; - list_for_each_entry(client, &hnae3_client_list, node) { - hnae3_match_n_instantiate(client, ae_dev, false, - &matched); - if (matched) - break; - } + list_for_each_entry(client, &hnae3_client_list, node) + hnae3_match_n_instantiate(client, ae_dev, false); ae_algo->ops->uninit_ae_dev(ae_dev); hnae_set_bit(ae_dev->flag, HNAE3_DEV_INITED_B, 0); -- cgit From b5b7db8d680464b1d631fd016f5e093419f0bfd9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 10:05:57 -0700 Subject: tcp: fastopen: fix on syn-data transmit failure Our recent change exposed a bug in TCP Fastopen Client that syzkaller found right away [1] When we prepare skb with SYN+DATA, we attempt to transmit it, and we update socket state as if the transmit was a success. In socket RTX queue we have two skbs, one with the SYN alone, and a second one containing the DATA. When (malicious) ACK comes in, we now complain that second one had no skb_mstamp. The proper fix is to make sure that if the transmit failed, we do not pretend we sent the DATA skb, and make it our send_head. When 3WHS completes, we can now send the DATA right away, without having to wait for a timeout. [1] WARNING: CPU: 0 PID: 100189 at net/ipv4/tcp_input.c:3117 tcp_clean_rtx_queue+0x2057/0x2ab0 net/ipv4/tcp_input.c:3117() WARN_ON_ONCE(last_ackt == 0); Modules linked in: CPU: 0 PID: 100189 Comm: syz-executor1 Not tainted Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 0000000000000000 ffff8800b35cb1d8 ffffffff81cad00d 0000000000000000 ffffffff828a4347 ffff88009f86c080 ffffffff8316eb20 0000000000000d7f ffff8800b35cb220 ffffffff812c33c2 ffff8800baad2440 00000009d46575c0 Call Trace: [] __dump_stack [] dump_stack+0xc1/0x124 [] warn_slowpath_common+0xe2/0x150 [] warn_slowpath_null+0x2e/0x40 [] tcp_clean_rtx_queue+0x2057/0x2ab0 n [] tcp_ack+0x151d/0x3930 [] tcp_rcv_state_process+0x1c69/0x4fd0 [] tcp_v4_do_rcv+0x54f/0x7c0 [] sk_backlog_rcv [] __release_sock+0x12b/0x3a0 [] release_sock+0x5e/0x1c0 [] inet_wait_for_connect [] __inet_stream_connect+0x545/0xc50 [] tcp_sendmsg_fastopen [] tcp_sendmsg+0x2298/0x35a0 [] inet_sendmsg+0xe5/0x520 [] sock_sendmsg_nosec [] sock_sendmsg+0xcf/0x110 Fixes: 8c72c65b426b ("tcp: update skb->skb_mstamp more carefully") Fixes: 783237e8daf1 ("net-tcp: Fast Open client - sending SYN-data") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 517d737059d1..0bc9e46a5369 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3389,6 +3389,10 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) goto done; } + /* data was not sent, this is our new send_head */ + sk->sk_send_head = syn_data; + tp->packets_out -= tcp_skb_pcount(syn_data); + fallback: /* Send a regular SYN with Fast Open cookie request option */ if (fo->cookie.len > 0) @@ -3441,6 +3445,11 @@ int tcp_connect(struct sock *sk) */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; + buff = tcp_send_head(sk); + if (unlikely(buff)) { + tp->snd_nxt = TCP_SKB_CB(buff)->seq; + tp->pushed_seq = TCP_SKB_CB(buff)->seq; + } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ -- cgit From f55956065ec94e3e9371463d693a1029c4cc3007 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Tue, 19 Sep 2017 19:35:18 +0200 Subject: net: emac: Fix napi poll list corruption This patch is pretty much a carbon copy of commit 3079c652141f ("caif: Fix napi poll list corruption") with "caif" replaced by "emac". The commit d75b1ade567f ("net: less interrupt masking in NAPI") breaks emac. It is now required that if the entire budget is consumed when poll returns, the napi poll_list must remain empty. However, like some other drivers emac tries to do a last-ditch check and if there is more work it will call napi_reschedule and then immediately process some of this new work. Should the entire budget be consumed while processing such new work then we will violate the new caller contract. This patch fixes this by not touching any work when we reschedule in emac. Signed-off-by: Christian Lamparter Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/emac/mal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/emac/mal.c b/drivers/net/ethernet/ibm/emac/mal.c index 2c74baa2398a..fff09dcf9e34 100644 --- a/drivers/net/ethernet/ibm/emac/mal.c +++ b/drivers/net/ethernet/ibm/emac/mal.c @@ -402,7 +402,7 @@ static int mal_poll(struct napi_struct *napi, int budget) unsigned long flags; MAL_DBG2(mal, "poll(%d)" NL, budget); - again: + /* Process TX skbs */ list_for_each(l, &mal->poll_list) { struct mal_commac *mc = @@ -451,7 +451,6 @@ static int mal_poll(struct napi_struct *napi, int budget) spin_lock_irqsave(&mal->lock, flags); mal_disable_eob_irq(mal); spin_unlock_irqrestore(&mal->lock, flags); - goto again; } mc->ops->poll_tx(mc->dev); } -- cgit From 7c30013133964aaa2f45c17d6e9782ac6cfd7f5f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Sep 2017 00:44:21 +0200 Subject: bpf: fix ri->map_owner pointer on bpf_prog_realloc Commit 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") passed the pointer to the prog itself to be loaded into r4 prior on bpf_redirect_map() helper call, so that we can store the owner into ri->map_owner out of the helper. Issue with that is that the actual address of the prog is still subject to change when subsequent rewrites occur that require slow path in bpf_prog_realloc() to alloc more memory, e.g. from patching inlining helper functions or constant blinding. Thus, we really need to take prog->aux as the address we're holding, which also works with prog clones as they share the same aux object. Instead of then fetching aux->prog during runtime, which could potentially incur cache misses due to false sharing, we are going to just use aux for comparison on the map owner. This will also keep the patchlet of the same size, and later check in xdp_map_invalid() only accesses read-only aux pointer from the prog, it's also in the same cacheline already from prior access when calling bpf_func. Fixes: 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 7 ++++++- net/core/filter.c | 24 +++++++++++++++--------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 799b2451ef2d..b914fbe1383e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4205,7 +4205,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) } if (insn->imm == BPF_FUNC_redirect_map) { - u64 addr = (unsigned long)prog; + /* Note, we cannot use prog directly as imm as subsequent + * rewrites would still change the prog pointer. The only + * stable address we can use is aux, which also works with + * prog clones during blinding. + */ + u64 addr = (unsigned long)prog->aux; struct bpf_insn r4_ld[] = { BPF_LD_IMM64(BPF_REG_4, addr), *insn, diff --git a/net/core/filter.c b/net/core/filter.c index 24dd33dd9f04..82edad58d066 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1794,7 +1794,7 @@ struct redirect_info { u32 flags; struct bpf_map *map; struct bpf_map *map_to_flush; - const struct bpf_prog *map_owner; + unsigned long map_owner; }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); @@ -2500,11 +2500,17 @@ void xdp_do_flush_map(void) } EXPORT_SYMBOL_GPL(xdp_do_flush_map); +static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, + unsigned long aux) +{ + return (unsigned long)xdp_prog->aux != aux; +} + static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); - const struct bpf_prog *map_owner = ri->map_owner; + unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; struct net_device *fwd = NULL; u32 index = ri->ifindex; @@ -2512,9 +2518,9 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->ifindex = 0; ri->map = NULL; - ri->map_owner = NULL; + ri->map_owner = 0; - if (unlikely(map_owner != xdp_prog)) { + if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { err = -EFAULT; map = NULL; goto err; @@ -2574,7 +2580,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); - const struct bpf_prog *map_owner = ri->map_owner; + unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; struct net_device *fwd = NULL; u32 index = ri->ifindex; @@ -2583,10 +2589,10 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, ri->ifindex = 0; ri->map = NULL; - ri->map_owner = NULL; + ri->map_owner = 0; if (map) { - if (unlikely(map_owner != xdp_prog)) { + if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { err = -EFAULT; map = NULL; goto err; @@ -2632,7 +2638,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; ri->map = NULL; - ri->map_owner = NULL; + ri->map_owner = 0; return XDP_REDIRECT; } @@ -2646,7 +2652,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { }; BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, - const struct bpf_prog *, map_owner) + unsigned long, map_owner) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); -- cgit From 6819a14ecbe2e089e5c5bb74edecafdde2028a00 Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Mon, 4 Sep 2017 15:52:55 +0100 Subject: net: ipv6: fix regression of no RTM_DELADDR sent after DAD failure Commit f784ad3d79e5 ("ipv6: do not send RTM_DELADDR for tentative addresses") incorrectly assumes that no RTM_NEWADDR are sent for addresses in tentative state, as this does happen for the standard IPv6 use-case of DAD failure, see the call to ipv6_ifa_notify() in addconf_dad_stop(). So as a result of this change, no RTM_DELADDR is sent after DAD failure for a link-local when strict DAD (accept_dad=2) is configured, or on the next admin down in other cases. The absence of this notification breaks backwards compatibility and causes problems after DAD failure if this notification was being relied on. The solution is to allow RTM_DELADDR to still be sent after DAD failure. Fixes: f784ad3d79e5 ("ipv6: do not send RTM_DELADDR for tentative addresses") Signed-off-by: Mike Manning Cc: Mahesh Bandewar Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c2e2a78787ec..d7dbcc8eda10 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4940,9 +4940,10 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) /* Don't send DELADDR notification for TENTATIVE address, * since NEWADDR notification is sent only after removing - * TENTATIVE flag. + * TENTATIVE flag, if DAD has not failed. */ - if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR) + if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_DADFAILED) && + event == RTM_DELADDR) return; skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); -- cgit From 35e015e1f5773417952fe91ce8790baf9b4237a2 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 12 Sep 2017 17:46:37 +0200 Subject: ipv6: fix net.ipv6.conf.all interface DAD handlers Currently, writing into net.ipv6.conf.all.{accept_dad,use_optimistic,optimistic_dad} has no effect. Fix handling of these flags by: - using the maximum of global and per-interface values for the accept_dad flag. That is, if at least one of the two values is non-zero, enable DAD on the interface. If at least one value is set to 2, enable DAD and disable IPv6 operation on the interface if MAC-based link-local address was found - using the logical OR of global and per-interface values for the optimistic_dad flag. If at least one of them is set to one, optimistic duplicate address detection (RFC 4429) is enabled on the interface - using the logical OR of global and per-interface values for the use_optimistic flag. If at least one of them is set to one, optimistic addresses won't be marked as deprecated during source address selection on the interface. While at it, as we're modifying the prototype for ipv6_use_optimistic_addr(), drop inline, and let the compiler decide. Fixes: 7fd2561e4ebd ("net: ipv6: Add a sysctl to make optimistic addresses useful candidates") Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 18 ++++++++++++++---- net/ipv6/addrconf.c | 27 ++++++++++++++++++++------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index b3345d0fe0a6..77f4de59dc9c 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1680,6 +1680,9 @@ accept_dad - INTEGER 2: Enable DAD, and disable IPv6 operation if MAC-based duplicate link-local address has been found. + DAD operation and mode on a given interface will be selected according + to the maximum value of conf/{all,interface}/accept_dad. + force_tllao - BOOLEAN Enable sending the target link-layer address option even when responding to a unicast neighbor solicitation. @@ -1727,16 +1730,23 @@ suppress_frag_ndisc - INTEGER optimistic_dad - BOOLEAN Whether to perform Optimistic Duplicate Address Detection (RFC 4429). - 0: disabled (default) - 1: enabled + 0: disabled (default) + 1: enabled + + Optimistic Duplicate Address Detection for the interface will be enabled + if at least one of conf/{all,interface}/optimistic_dad is set to 1, + it will be disabled otherwise. use_optimistic - BOOLEAN If enabled, do not classify optimistic addresses as deprecated during source address selection. Preferred addresses will still be chosen before optimistic addresses, subject to other ranking in the source address selection algorithm. - 0: disabled (default) - 1: enabled + 0: disabled (default) + 1: enabled + + This will be enabled if at least one of + conf/{all,interface}/use_optimistic is set to 1, disabled otherwise. stable_secret - IPv6 address This IPv6 address will be used as a secret to generate IPv6 diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d7dbcc8eda10..96861c702c06 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1399,10 +1399,18 @@ static inline int ipv6_saddr_preferred(int type) return 0; } -static inline bool ipv6_use_optimistic_addr(struct inet6_dev *idev) +static bool ipv6_use_optimistic_addr(struct net *net, + struct inet6_dev *idev) { #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - return idev && idev->cnf.optimistic_dad && idev->cnf.use_optimistic; + if (!idev) + return false; + if (!net->ipv6.devconf_all->optimistic_dad && !idev->cnf.optimistic_dad) + return false; + if (!net->ipv6.devconf_all->use_optimistic && !idev->cnf.use_optimistic) + return false; + + return true; #else return false; #endif @@ -1472,7 +1480,7 @@ static int ipv6_get_saddr_eval(struct net *net, /* Rule 3: Avoid deprecated and optimistic addresses */ u8 avoid = IFA_F_DEPRECATED; - if (!ipv6_use_optimistic_addr(score->ifa->idev)) + if (!ipv6_use_optimistic_addr(net, score->ifa->idev)) avoid |= IFA_F_OPTIMISTIC; ret = ipv6_saddr_preferred(score->addr_type) || !(score->ifa->flags & avoid); @@ -2460,7 +2468,8 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, int max_addresses = in6_dev->cnf.max_addresses; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if (in6_dev->cnf.optimistic_dad && + if ((net->ipv6.devconf_all->optimistic_dad || + in6_dev->cnf.optimistic_dad) && !net->ipv6.devconf_all->forwarding && sllao) addr_flags |= IFA_F_OPTIMISTIC; #endif @@ -3051,7 +3060,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev, u32 addr_flags = flags | IFA_F_PERMANENT; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if (idev->cnf.optimistic_dad && + if ((dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad || + idev->cnf.optimistic_dad) && !dev_net(idev->dev)->ipv6.devconf_all->forwarding) addr_flags |= IFA_F_OPTIMISTIC; #endif @@ -3810,6 +3820,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) goto out; if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || + dev_net(dev)->ipv6.devconf_all->accept_dad < 1 || idev->cnf.accept_dad < 1 || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { @@ -3841,7 +3852,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) */ if (ifp->flags & IFA_F_OPTIMISTIC) { ip6_ins_rt(ifp->rt); - if (ipv6_use_optimistic_addr(idev)) { + if (ipv6_use_optimistic_addr(dev_net(dev), idev)) { /* Because optimistic nodes can use this address, * notify listeners. If DAD fails, RTM_DELADDR is sent. */ @@ -3897,7 +3908,9 @@ static void addrconf_dad_work(struct work_struct *w) action = DAD_ABORT; ifp->state = INET6_IFADDR_STATE_POSTDAD; - if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6 && + if ((dev_net(idev->dev)->ipv6.devconf_all->accept_dad > 1 || + idev->cnf.accept_dad > 1) && + !idev->cnf.disable_ipv6 && !(ifp->flags & IFA_F_STABLE_PRIVACY)) { struct in6_addr addr; -- cgit From 008ba2a13f2d04c947adc536d19debb8fe66f110 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 14 Sep 2017 17:14:41 -0400 Subject: packet: hold bind lock when rebinding to fanout hook Packet socket bind operations must hold the po->bind_lock. This keeps po->running consistent with whether the socket is actually on a ptype list to receive packets. fanout_add unbinds a socket and its packet_rcv/tpacket_rcv call, then binds the fanout object to receive through packet_rcv_fanout. Make it hold the po->bind_lock when testing po->running and rebinding. Else, it can race with other rebind operations, such as that in packet_set_ring from packet_rcv to tpacket_rcv. Concurrent updates can result in a socket being added to a fanout group twice, causing use-after-free KASAN bug reports, among others. Reported independently by both trinity and syzkaller. Verified that the syzkaller reproducer passes after this patch. Fixes: dc99f600698d ("packet: Add fanout support.") Reported-by: nixioaming Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c26172995511..d288f52c53f7 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1684,10 +1684,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) mutex_lock(&fanout_mutex); - err = -EINVAL; - if (!po->running) - goto out; - err = -EALREADY; if (po->fanout) goto out; @@ -1749,7 +1745,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) list_add(&match->list, &fanout_list); } err = -EINVAL; - if (match->type == type && + + spin_lock(&po->bind_lock); + if (po->running && + match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; @@ -1761,6 +1760,13 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) err = 0; } } + spin_unlock(&po->bind_lock); + + if (err && !refcount_read(&match->sk_ref)) { + list_del(&match->list); + kfree(match); + } + out: if (err && rollover) { kfree(rollover); -- cgit From ec9dd352d591f0c90402ec67a317c1ed4fb2e638 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 18 Sep 2017 16:38:36 -0700 Subject: bpf: one perf event close won't free bpf program attached by another perf event This patch fixes a bug exhibited by the following scenario: 1. fd1 = perf_event_open with attr.config = ID1 2. attach bpf program prog1 to fd1 3. fd2 = perf_event_open with attr.config = ID1 4. user program closes fd2 and prog1 is detached from the tracepoint. 5. user program with fd1 does not work properly as tracepoint no output any more. The issue happens at step 4. Multiple perf_event_open can be called successfully, but only one bpf prog pointer in the tp_event. In the current logic, any fd release for the same tp_event will free the tp_event->prog. The fix is to free tp_event->prog only when the closing fd corresponds to the one which registered the program. Signed-off-by: Yonghong Song Signed-off-by: David S. Miller --- include/linux/trace_events.h | 1 + kernel/events/core.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 7f11050746ae..2e0f22298fe9 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -272,6 +272,7 @@ struct trace_event_call { int perf_refcount; struct hlist_head __percpu *perf_events; struct bpf_prog *prog; + struct perf_event *bpf_prog_owner; int (*perf_perm)(struct trace_event_call *, struct perf_event *); diff --git a/kernel/events/core.c b/kernel/events/core.c index 3e691b75b2db..6bc21e202ae4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8171,6 +8171,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) } } event->tp_event->prog = prog; + event->tp_event->bpf_prog_owner = event; return 0; } @@ -8185,7 +8186,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) return; prog = event->tp_event->prog; - if (prog) { + if (prog && event->tp_event->bpf_prog_owner == event) { event->tp_event->prog = NULL; bpf_prog_put(prog); } -- cgit From c2a64bb9fcd31c39feddf30748b4ee8d82e53c6a Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 19 Sep 2017 13:19:13 -0400 Subject: net: compat: assert the size of cmsg copied in is as expected The actual length of cmsg fetched in during the second loop (i.e., kcmsg - kcmsg_base) could be different from what we get from the first loop (i.e., kcmlen). The main reason is that the two get_user() calls in the two loops (i.e., get_user(ucmlen, &ucmsg->cmsg_len) and __get_user(ucmlen, &ucmsg->cmsg_len)) could cause ucmlen to have different values even they fetch from the same userspace address, as user can race to change the memory content in &ucmsg->cmsg_len across fetches. Although in the second loop, the sanity check if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp)) is inplace, it only ensures that the cmsg fetched in during the second loop does not exceed the length of kcmlen, but not necessarily equal to kcmlen. But indicated by the assignment kmsg->msg_controllen = kcmlen, we should enforce that. This patch adds this additional sanity check and ensures that what is recorded in kmsg->msg_controllen is the actual cmsg length. Signed-off-by: Meng Xu Signed-off-by: David S. Miller --- net/compat.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/compat.c b/net/compat.c index 6ded6c821d7a..22381719718c 100644 --- a/net/compat.c +++ b/net/compat.c @@ -185,6 +185,13 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen); } + /* + * check the length of messages copied in is the same as the + * what we get from the first loop + */ + if ((char *)kcmsg - (char *)kcmsg_base != kcmlen) + goto Einval; + /* Ok, looks like we made it. Hook it up and return success. */ kmsg->msg_control = kcmsg_base; kmsg->msg_controllen = kcmlen; -- cgit From 92dd5452c1be873a1193561f4f691763103d22ac Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 19 Sep 2017 18:45:56 +0100 Subject: net: change skb->mac_header when Generic XDP calls adjust_head Since XDP's view of the packet includes the MAC header, moving the start- of-packet with bpf_xdp_adjust_head needs to also update the offset of the MAC header (which is relative to skb->head, not to the skb->data that was changed). Without this, tcpdump sees packets starting from the old MAC header rather than the new one, at least in my tests on the loopback device. Fixes: b5cdae3291f7 ("net: Generic XDP") Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/dev.c b/net/core/dev.c index fb766d906148..9a2254f9802f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3892,6 +3892,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, __skb_pull(skb, off); else if (off < 0) __skb_push(skb, -off); + skb->mac_header += off; switch (act) { case XDP_REDIRECT: -- cgit From 5e62d98c4bbc243f3dca18e73a754b629839fc5c Mon Sep 17 00:00:00 2001 From: Troy Kisky Date: Tue, 19 Sep 2017 17:33:07 -0700 Subject: net: fec: only check queue 0 if RXF_0/TXF_0 interrupt is set Before queue 0 was always checked if any queue caused an interrupt. It is better to just mark queue 0 if queue 0 has caused an interrupt. Signed-off-by: Troy Kisky Acked-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 56f56d6ada9c..464055fb33d5 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1559,14 +1559,14 @@ fec_enet_collect_events(struct fec_enet_private *fep, uint int_events) if (int_events == 0) return false; - if (int_events & FEC_ENET_RXF) + if (int_events & FEC_ENET_RXF_0) fep->work_rx |= (1 << 2); if (int_events & FEC_ENET_RXF_1) fep->work_rx |= (1 << 0); if (int_events & FEC_ENET_RXF_2) fep->work_rx |= (1 << 1); - if (int_events & FEC_ENET_TXF) + if (int_events & FEC_ENET_TXF_0) fep->work_tx |= (1 << 2); if (int_events & FEC_ENET_TXF_1) fep->work_tx |= (1 << 0); -- cgit From 7063c163cd4a20184b3bbada503dab8f254a8c56 Mon Sep 17 00:00:00 2001 From: Troy Kisky Date: Tue, 19 Sep 2017 17:33:08 -0700 Subject: net: fec: remove unused interrupt FEC_ENET_TS_TIMER FEC_ENET_TS_TIMER is not checked in the interrupt routine so there is no need to enable it. Signed-off-by: Troy Kisky Acked-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index 38c7b21e5d63..ede1876a9a19 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -374,8 +374,8 @@ struct bufdesc_ex { #define FEC_ENET_TS_AVAIL ((uint)0x00010000) #define FEC_ENET_TS_TIMER ((uint)0x00008000) -#define FEC_DEFAULT_IMASK (FEC_ENET_TXF | FEC_ENET_RXF | FEC_ENET_MII | FEC_ENET_TS_TIMER) -#define FEC_NAPI_IMASK (FEC_ENET_MII | FEC_ENET_TS_TIMER) +#define FEC_DEFAULT_IMASK (FEC_ENET_TXF | FEC_ENET_RXF | FEC_ENET_MII) +#define FEC_NAPI_IMASK FEC_ENET_MII #define FEC_RX_DISABLED_IMASK (FEC_DEFAULT_IMASK & (~FEC_ENET_RXF)) /* ENET interrupt coalescing macro define */ -- cgit From e24ee2780a1d6786b94139ebf95b44c9ae62bf13 Mon Sep 17 00:00:00 2001 From: Troy Kisky Date: Tue, 19 Sep 2017 17:33:09 -0700 Subject: net: fec: return IRQ_HANDLED if fec_ptp_check_pps_event handled it fec_ptp_check_pps_event will return 1 if FEC_T_TF_MASK caused an interrupt. Don't return IRQ_NONE in this case. Signed-off-by: Troy Kisky Acked-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 464055fb33d5..3dc2d771a222 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1604,8 +1604,8 @@ fec_enet_interrupt(int irq, void *dev_id) } if (fep->ptp_clock) - fec_ptp_check_pps_event(fep); - + if (fec_ptp_check_pps_event(fep)) + ret = IRQ_HANDLED; return ret; } -- cgit From 02388bf87f72e1d47174cd8f81c34443920eb5a0 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 19 Sep 2017 21:49:55 -0400 Subject: isdn/i4l: fetch the ppp_write buffer in one shot In isdn_ppp_write(), the header (i.e., protobuf) of the buffer is fetched twice from userspace. The first fetch is used to peek at the protocol of the message and reset the huptimer if necessary; while the second fetch copies in the whole buffer. However, given that buf resides in userspace memory, a user process can race to change its memory content across fetches. By doing so, we can either avoid resetting the huptimer for any type of packets (by first setting proto to PPP_LCP and later change to the actual type) or force resetting the huptimer for LCP packets. This patch changes this double-fetch behavior into two single fetches decided by condition (lp->isdn_device < 0 || lp->isdn_channel <0). A more detailed discussion can be found at https://marc.info/?l=linux-kernel&m=150586376926123&w=2 Signed-off-by: Meng Xu Signed-off-by: David S. Miller --- drivers/isdn/i4l/isdn_ppp.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c index 6c44609fd83a..cd2b3c69771a 100644 --- a/drivers/isdn/i4l/isdn_ppp.c +++ b/drivers/isdn/i4l/isdn_ppp.c @@ -825,7 +825,6 @@ isdn_ppp_write(int min, struct file *file, const char __user *buf, int count) isdn_net_local *lp; struct ippp_struct *is; int proto; - unsigned char protobuf[4]; is = file->private_data; @@ -839,24 +838,28 @@ isdn_ppp_write(int min, struct file *file, const char __user *buf, int count) if (!lp) printk(KERN_DEBUG "isdn_ppp_write: lp == NULL\n"); else { - /* - * Don't reset huptimer for - * LCP packets. (Echo requests). - */ - if (copy_from_user(protobuf, buf, 4)) - return -EFAULT; - proto = PPP_PROTOCOL(protobuf); - if (proto != PPP_LCP) - lp->huptimer = 0; + if (lp->isdn_device < 0 || lp->isdn_channel < 0) { + unsigned char protobuf[4]; + /* + * Don't reset huptimer for + * LCP packets. (Echo requests). + */ + if (copy_from_user(protobuf, buf, 4)) + return -EFAULT; + + proto = PPP_PROTOCOL(protobuf); + if (proto != PPP_LCP) + lp->huptimer = 0; - if (lp->isdn_device < 0 || lp->isdn_channel < 0) return 0; + } if ((dev->drv[lp->isdn_device]->flags & DRV_FLAG_RUNNING) && lp->dialstate == 0 && (lp->flags & ISDN_NET_CONNECTED)) { unsigned short hl; struct sk_buff *skb; + unsigned char *cpy_buf; /* * we need to reserve enough space in front of * sk_buff. old call to dev_alloc_skb only reserved @@ -869,11 +872,21 @@ isdn_ppp_write(int min, struct file *file, const char __user *buf, int count) return count; } skb_reserve(skb, hl); - if (copy_from_user(skb_put(skb, count), buf, count)) + cpy_buf = skb_put(skb, count); + if (copy_from_user(cpy_buf, buf, count)) { kfree_skb(skb); return -EFAULT; } + + /* + * Don't reset huptimer for + * LCP packets. (Echo requests). + */ + proto = PPP_PROTOCOL(cpy_buf); + if (proto != PPP_LCP) + lp->huptimer = 0; + if (is->debug & 0x40) { printk(KERN_DEBUG "ppp xmit: len %d\n", (int) skb->len); isdn_ppp_frame_log("xmit", skb->data, skb->len, 32, is->unit, lp->ppp_slot); -- cgit From e92a0843795779678397ac0790a76de20f79cc13 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:50 +0800 Subject: net: hns3: Cleanup for ROCE capability flag in ae_dev This patch add the ROCE supported flag in the driver_data field of pci_device_id, delete roce_pci_tbl and change HNAE_DEV_SUPPORT_ROCE_B to HNAE3_DEV_SUPPORT_ROCE_B. This cleanup is done in order to support adding capability in pci_device_id and to fix initialization failure when cmd is not supported. Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 5 ++++- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 25 ++++------------------ .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 16 +++++++++----- 3 files changed, 19 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index b2f28ae81273..0f7b61a92f44 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -49,7 +49,10 @@ #define HNAE3_CLASS_NAME_SIZE 16 #define HNAE3_DEV_INITED_B 0x0 -#define HNAE_DEV_SUPPORT_ROCE_B 0x1 +#define HNAE3_DEV_SUPPORT_ROCE_B 0x1 + +#define hnae3_dev_roce_supported(hdev) \ + hnae_get_bit(hdev->ae_dev->flag, HNAE3_DEV_SUPPORT_ROCE_B) #define ring_ptr_move_fw(ring, p) \ ((ring)->p = ((ring)->p + 1) % (ring)->desc_num) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 74008ef23169..6953d19c6475 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -46,17 +46,7 @@ static const struct pci_device_id ae_algo_pci_tbl[] = { {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), 0}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), 0}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), 0}, - /* Required last entry */ - {0, } -}; - -static const struct pci_device_id roce_pci_tbl[] = { - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), 0}, - /* Required last entry */ + /* required last entry */ {0, } }; @@ -894,7 +884,7 @@ static int hclge_query_pf_resource(struct hclge_dev *hdev) hdev->num_tqps = __le16_to_cpu(req->tqp_num); hdev->pkt_buf_size = __le16_to_cpu(req->buf_size) << HCLGE_BUF_UNIT_S; - if (hnae_get_bit(hdev->ae_dev->flag, HNAE_DEV_SUPPORT_ROCE_B)) { + if (hnae3_dev_roce_supported(hdev)) { hdev->num_roce_msix = hnae_get_field(__le16_to_cpu(req->pf_intr_vector_number), HCLGE_PF_VEC_NUM_M, HCLGE_PF_VEC_NUM_S); @@ -3932,8 +3922,7 @@ static int hclge_init_client_instance(struct hnae3_client *client, goto err; if (hdev->roce_client && - hnae_get_bit(hdev->ae_dev->flag, - HNAE_DEV_SUPPORT_ROCE_B)) { + hnae3_dev_roce_supported(hdev)) { struct hnae3_client *rc = hdev->roce_client; ret = hclge_init_roce_base_info(vport); @@ -3956,8 +3945,7 @@ static int hclge_init_client_instance(struct hnae3_client *client, break; case HNAE3_CLIENT_ROCE: - if (hnae_get_bit(hdev->ae_dev->flag, - HNAE_DEV_SUPPORT_ROCE_B)) { + if (hnae3_dev_roce_supported(hdev)) { hdev->roce_client = client; vport->roce.client = client; } @@ -4069,7 +4057,6 @@ static void hclge_pci_uninit(struct hclge_dev *hdev) static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) { struct pci_dev *pdev = ae_dev->pdev; - const struct pci_device_id *id; struct hclge_dev *hdev; int ret; @@ -4084,10 +4071,6 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) hdev->ae_dev = ae_dev; ae_dev->priv = hdev; - id = pci_match_id(roce_pci_tbl, ae_dev->pdev); - if (id) - hnae_set_bit(ae_dev->flag, HNAE_DEV_SUPPORT_ROCE_B, 1); - ret = hclge_pci_init(hdev); if (ret) { dev_err(&pdev->dev, "PCI init failed\n"); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index 4d68d6ea5143..94d8bb5b92f0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -41,11 +41,16 @@ static struct hnae3_client client; static const struct pci_device_id hns3_pci_tbl[] = { {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_GE), 0}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), 0}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), + BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), + BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), + BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), + BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), + BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, /* required last entry */ {0, } }; @@ -1348,6 +1353,7 @@ static int hns3_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } ae_dev->pdev = pdev; + ae_dev->flag = ent->driver_data; ae_dev->dev_type = HNAE3_DEV_KNIC; pci_set_drvdata(pdev, ae_dev); -- cgit From 2daf4a6536f3109ed0ed758cec14743e0e5c20ea Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:51 +0800 Subject: net: hns3: Fix initialization when cmd is not supported When ae_dev doesn't support DCB, rx_priv_wl_config, common_thrd_config and tm_qs_bp_cfg can't be called, otherwise cmd return fail, which causes the hclge module initialization process to fail. This patch fix it by adding a DCB capability flag to check if the ae_dev support DCB. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 7 ++++++ .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 26 +++++++++++++--------- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 4 ++++ .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 10 ++++----- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 0f7b61a92f44..ad685f5aa6d1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -50,10 +50,17 @@ #define HNAE3_DEV_INITED_B 0x0 #define HNAE3_DEV_SUPPORT_ROCE_B 0x1 +#define HNAE3_DEV_SUPPORT_DCB_B 0x2 + +#define HNAE3_DEV_SUPPORT_ROCE_DCB_BITS (BIT(HNAE3_DEV_SUPPORT_DCB_B) |\ + BIT(HNAE3_DEV_SUPPORT_ROCE_B)) #define hnae3_dev_roce_supported(hdev) \ hnae_get_bit(hdev->ae_dev->flag, HNAE3_DEV_SUPPORT_ROCE_B) +#define hnae3_dev_dcb_supported(hdev) \ + hnae_get_bit(hdev->ae_dev->flag, HNAE3_DEV_SUPPORT_DCB_B) + #define ring_ptr_move_fw(ring, p) \ ((ring)->p = ((ring)->p + 1) % (ring)->desc_num) #define ring_ptr_move_bw(ring, p) \ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 6953d19c6475..903f43a8c2a1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1772,18 +1772,22 @@ int hclge_buffer_alloc(struct hclge_dev *hdev) return ret; } - ret = hclge_rx_priv_wl_config(hdev); - if (ret) { - dev_err(&hdev->pdev->dev, - "could not configure rx private waterline %d\n", ret); - return ret; - } + if (hnae3_dev_dcb_supported(hdev)) { + ret = hclge_rx_priv_wl_config(hdev); + if (ret) { + dev_err(&hdev->pdev->dev, + "could not configure rx private waterline %d\n", + ret); + return ret; + } - ret = hclge_common_thrd_config(hdev); - if (ret) { - dev_err(&hdev->pdev->dev, - "could not configure common threshold %d\n", ret); - return ret; + ret = hclge_common_thrd_config(hdev); + if (ret) { + dev_err(&hdev->pdev->dev, + "could not configure common threshold %d\n", + ret); + return ret; + } } ret = hclge_common_wl_config(hdev); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index 1c577d268f00..c91dbf19c4b1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -976,6 +976,10 @@ int hclge_pause_setup_hw(struct hclge_dev *hdev) if (ret) return ret; + /* Only DCB-supported dev supports qset back pressure setting */ + if (!hnae3_dev_dcb_supported(hdev)) + return 0; + for (i = 0; i < hdev->tm_info.num_tc; i++) { ret = hclge_tm_qs_bp_cfg(hdev, i); if (ret) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index 94d8bb5b92f0..35369e1c8036 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -42,15 +42,15 @@ static const struct pci_device_id hns3_pci_tbl[] = { {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_GE), 0}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE), 0}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), - BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), - BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), - BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), - BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), - BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, /* required last entry */ {0, } }; -- cgit From d221df4e0faae2b9cc8ad78f3e5e777461b6b542 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:52 +0800 Subject: net: hns3: Fix for DEFAULT_DV when dev doesn't support DCB When ae_dev doesn't support DCB, DEFAULT_DV must be set to a lower value, otherwise the buffer allocation process will fail. This patch fix it by setting it to 30K bytes. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 1 + drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index c2b613b40509..30e2ad5ac0da 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -688,6 +688,7 @@ struct hclge_reset_tqp_queue { #define HCLGE_DEFAULT_TX_BUF 0x4000 /* 16k bytes */ #define HCLGE_TOTAL_PKT_BUF 0x108000 /* 1.03125M bytes */ #define HCLGE_DEFAULT_DV 0xA000 /* 40k byte */ +#define HCLGE_DEFAULT_NON_DCB_DV 0x7800 /* 30K byte */ #define HCLGE_TYPE_CRQ 0 #define HCLGE_TYPE_CSQ 1 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 903f43a8c2a1..796370adf99c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1444,7 +1444,11 @@ static bool hclge_is_rx_buf_ok(struct hclge_dev *hdev, u32 rx_all) tc_num = hclge_get_tc_num(hdev); pfc_enable_num = hclge_get_pfc_enalbe_num(hdev); - shared_buf_min = 2 * hdev->mps + HCLGE_DEFAULT_DV; + if (hnae3_dev_dcb_supported(hdev)) + shared_buf_min = 2 * hdev->mps + HCLGE_DEFAULT_DV; + else + shared_buf_min = 2 * hdev->mps + HCLGE_DEFAULT_NON_DCB_DV; + shared_buf_tc = pfc_enable_num * hdev->mps + (tc_num - pfc_enable_num) * hdev->mps / 2 + hdev->mps; -- cgit From bb1fe9ea6371e075d3d1448cd3ff6441d31307be Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:53 +0800 Subject: net: hns3: Fix for not setting rx private buffer size to zero When rx private buffer is disabled, there may be some case that the rx private buffer is not set to zero, which may cause buffer allocation process to fail. This patch fixes this problem by setting priv->enable to 0 and priv->buf_size to zero when rx private buffer is disabled. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 796370adf99c..a7d8fb1e15f6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1504,6 +1504,11 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size) priv->wl.high = 2 * hdev->mps; priv->buf_size = priv->wl.high; } + } else { + priv->enable = 0; + priv->wl.low = 0; + priv->wl.high = 0; + priv->buf_size = 0; } } @@ -1516,8 +1521,15 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size) for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { priv = &hdev->priv_buf[i]; - if (hdev->hw_tc_map & BIT(i)) - priv->enable = 1; + priv->enable = 0; + priv->wl.low = 0; + priv->wl.high = 0; + priv->buf_size = 0; + + if (!(hdev->hw_tc_map & BIT(i))) + continue; + + priv->enable = 1; if (hdev->tm_info.hw_pfc_map & BIT(i)) { priv->wl.low = 128; -- cgit From b8c8bf47da5576657370798da6f18a8cb0245d5b Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:54 +0800 Subject: net: hns3: Fix for rx_priv_buf_alloc not setting rx shared buffer rx_priv_buf_alloc is used to tell hardware how much buffer is used for rx direction, right now only the private buffer is assigned. For ae_dev that doesn't support DCB, private rx buffer is assigned to zero, only shared rx buffer is used. So not setting the shared rx buffer cause dropping of packet in SSU. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 3 ++- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index 30e2ad5ac0da..758cf3948131 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -270,7 +270,8 @@ struct hclge_tx_buff_alloc { struct hclge_rx_priv_buff { __le16 buf_num[HCLGE_TC_NUM]; - u8 rsv[8]; + __le16 shared_buf; + u8 rsv[6]; }; struct hclge_query_version { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index a7d8fb1e15f6..e313552bb23d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1622,6 +1622,10 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev) cpu_to_le16(true << HCLGE_TC0_PRI_BUF_EN_B); } + req->shared_buf = + cpu_to_le16((hdev->s_buf.buf_size >> HCLGE_BUF_UNIT_S) | + (1 << HCLGE_TC0_PRI_BUF_EN_B)); + ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) { dev_err(&hdev->pdev->dev, -- cgit From d602a52540c9b92e0dd152cfe1d0848c23f08894 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:55 +0800 Subject: net: hns3: Fix for rx priv buf allocation when DCB is not supported When hdev doesn't support DCB, rx private buffer is not allocated, otherwise there is not enough buffer for rx shared buffer, causing buffer allocation process to fail. This patch fixes by checking the dcb capability in hclge_rx_buffer_calc. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e313552bb23d..c660f0caf709 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1489,6 +1489,16 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size) struct hclge_priv_buf *priv; int i; + /* When DCB is not supported, rx private + * buffer is not allocated. + */ + if (!hnae3_dev_dcb_supported(hdev)) { + if (!hclge_is_rx_buf_ok(hdev, rx_all)) + return -ENOMEM; + + return 0; + } + /* step 1, try to alloc private buffer for all enabled tc */ for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { priv = &hdev->priv_buf[i]; -- cgit From c4726338d928c824f56c27734d837b8244132705 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:56 +0800 Subject: net: hns3: Fix typo error for feild in hclge_tm This patch fixes a typo error for feild, which should be field. Fixes: 848440544b41f ("net: hns3: Add support of TX Scheduler & Shaper to HNS3 driver") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 20 ++++++++++---------- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index c91dbf19c4b1..fe659f752237 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -280,11 +280,11 @@ static int hclge_tm_pg_shapping_cfg(struct hclge_dev *hdev, shap_cfg_cmd->pg_id = pg_id; - hclge_tm_set_feild(shap_cfg_cmd->pg_shapping_para, IR_B, ir_b); - hclge_tm_set_feild(shap_cfg_cmd->pg_shapping_para, IR_U, ir_u); - hclge_tm_set_feild(shap_cfg_cmd->pg_shapping_para, IR_S, ir_s); - hclge_tm_set_feild(shap_cfg_cmd->pg_shapping_para, BS_B, bs_b); - hclge_tm_set_feild(shap_cfg_cmd->pg_shapping_para, BS_S, bs_s); + hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_B, ir_b); + hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_U, ir_u); + hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_S, ir_s); + hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, BS_B, bs_b); + hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, BS_S, bs_s); return hclge_cmd_send(&hdev->hw, &desc, 1); } @@ -307,11 +307,11 @@ static int hclge_tm_pri_shapping_cfg(struct hclge_dev *hdev, shap_cfg_cmd->pri_id = pri_id; - hclge_tm_set_feild(shap_cfg_cmd->pri_shapping_para, IR_B, ir_b); - hclge_tm_set_feild(shap_cfg_cmd->pri_shapping_para, IR_U, ir_u); - hclge_tm_set_feild(shap_cfg_cmd->pri_shapping_para, IR_S, ir_s); - hclge_tm_set_feild(shap_cfg_cmd->pri_shapping_para, BS_B, bs_b); - hclge_tm_set_feild(shap_cfg_cmd->pri_shapping_para, BS_S, bs_s); + hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_B, ir_b); + hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_U, ir_u); + hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_S, ir_s); + hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, BS_B, bs_b); + hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, BS_S, bs_s); return hclge_cmd_send(&hdev->hw, &desc, 1); } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h index 7e67337dfaf2..85158b0d73fe 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h @@ -94,10 +94,10 @@ struct hclge_bp_to_qs_map_cmd { u32 rsvd1; }; -#define hclge_tm_set_feild(dest, string, val) \ +#define hclge_tm_set_field(dest, string, val) \ hnae_set_field((dest), (HCLGE_TM_SHAP_##string##_MSK), \ (HCLGE_TM_SHAP_##string##_LSH), val) -#define hclge_tm_get_feild(src, string) \ +#define hclge_tm_get_field(src, string) \ hnae_get_field((src), (HCLGE_TM_SHAP_##string##_MSK), \ (HCLGE_TM_SHAP_##string##_LSH)) -- cgit From 68ece54efd417d415462adbaa2700cba50de3ff6 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:57 +0800 Subject: net: hns3: Fix for setting rss_size incorrectly rss_size is 1, 2, 4, 8, 16, 32, 64, 128, but acutal tc queue size can be any u16 less than 128. If tc queue size is 5, we set the rss_size to 8, indirection table will be used to limit the size of actual queue size. It may cause dropping of receiving packet in hardware if rss_size is not set correctly. For now, each TC has the same rss size. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 76 ++++++++++------------ .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 1 + .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 1 + 3 files changed, 38 insertions(+), 40 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index c660f0caf709..e0685e630afe 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2606,6 +2606,7 @@ static int hclge_rss_init_hw(struct hclge_dev *hdev) u16 tc_valid[HCLGE_MAX_TC_NUM]; u16 tc_size[HCLGE_MAX_TC_NUM]; u32 *rss_indir = NULL; + u16 rss_size = 0, roundup_size; const u8 *key; int i, ret, j; @@ -2620,7 +2621,13 @@ static int hclge_rss_init_hw(struct hclge_dev *hdev) for (j = 0; j < hdev->num_vmdq_vport + 1; j++) { for (i = 0; i < HCLGE_RSS_IND_TBL_SIZE; i++) { vport[j].rss_indirection_tbl[i] = - i % hdev->rss_size_max; + i % vport[j].alloc_rss_size; + + /* vport 0 is for PF */ + if (j != 0) + continue; + + rss_size = vport[j].alloc_rss_size; rss_indir[i] = vport[j].rss_indirection_tbl[i]; } } @@ -2637,42 +2644,31 @@ static int hclge_rss_init_hw(struct hclge_dev *hdev) if (ret) goto err; + /* Each TC have the same queue size, and tc_size set to hardware is + * the log2 of roundup power of two of rss_size, the acutal queue + * size is limited by indirection table. + */ + if (rss_size > HCLGE_RSS_TC_SIZE_7 || rss_size == 0) { + dev_err(&hdev->pdev->dev, + "Configure rss tc size failed, invalid TC_SIZE = %d\n", + rss_size); + return -EINVAL; + } + + roundup_size = roundup_pow_of_two(rss_size); + roundup_size = ilog2(roundup_size); + for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { - if (hdev->hw_tc_map & BIT(i)) - tc_valid[i] = 1; - else - tc_valid[i] = 0; + tc_valid[i] = 0; - switch (hdev->rss_size_max) { - case HCLGE_RSS_TC_SIZE_0: - tc_size[i] = 0; - break; - case HCLGE_RSS_TC_SIZE_1: - tc_size[i] = 1; - break; - case HCLGE_RSS_TC_SIZE_2: - tc_size[i] = 2; - break; - case HCLGE_RSS_TC_SIZE_3: - tc_size[i] = 3; - break; - case HCLGE_RSS_TC_SIZE_4: - tc_size[i] = 4; - break; - case HCLGE_RSS_TC_SIZE_5: - tc_size[i] = 5; - break; - case HCLGE_RSS_TC_SIZE_6: - tc_size[i] = 6; - break; - case HCLGE_RSS_TC_SIZE_7: - tc_size[i] = 7; - break; - default: - break; - } - tc_offset[i] = hdev->rss_size_max * i; + if (!(hdev->hw_tc_map & BIT(i))) + continue; + + tc_valid[i] = 1; + tc_size[i] = roundup_size; + tc_offset[i] = rss_size * i; } + ret = hclge_set_rss_tc_mode(hdev, tc_valid, tc_size, tc_offset); err: @@ -4167,12 +4163,6 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) return ret; } - ret = hclge_rss_init_hw(hdev); - if (ret) { - dev_err(&pdev->dev, "Rss init fail, ret =%d\n", ret); - return ret; - } - ret = hclge_init_vlan_config(hdev); if (ret) { dev_err(&pdev->dev, "VLAN init fail, ret =%d\n", ret); @@ -4185,6 +4175,12 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) return ret; } + ret = hclge_rss_init_hw(hdev); + if (ret) { + dev_err(&pdev->dev, "Rss init fail, ret =%d\n", ret); + return ret; + } + setup_timer(&hdev->service_timer, hclge_service_timer, (unsigned long)hdev); INIT_WORK(&hdev->service_task, hclge_service_task); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index edb10ad075eb..7f8dd129c10d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -477,6 +477,7 @@ struct hclge_vport { u8 rss_hash_key[HCLGE_RSS_KEY_SIZE]; /* User configured hash keys */ /* User configured lookup table entries */ u8 rss_indirection_tbl[HCLGE_RSS_IND_TBL_SIZE]; + u16 alloc_rss_size; u16 qs_offset; u16 bw_limit; /* VSI BW Limit (0 = disabled) */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index fe659f752237..b7ba7aa66620 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -397,6 +397,7 @@ static void hclge_tm_vport_tc_info_update(struct hclge_vport *vport) kinfo->num_tqps / kinfo->num_tc); vport->qs_offset = hdev->tm_info.num_tc * vport->vport_id; vport->dwrr = 100; /* 100 percent as init */ + vport->alloc_rss_size = kinfo->rss_size; for (i = 0; i < kinfo->num_tc; i++) { if (hdev->hw_tc_map & BIT(i)) { -- cgit From c5795c5308af81568d1573598716091120c85a38 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:58 +0800 Subject: net: hns3: Fix for pri to tc mapping in TM Current mapping between pri and tc is one to one, so user can't map multi priorities to the same tc. This patch changes the mapping to many to one. Fixes: 848440544b41f ("net: hns3: Add support of TX Scheduler & Shaper to HNS3 driver") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 3 ++- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 16 +++++++++------- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index ad685f5aa6d1..1a01cadfe5f3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -376,12 +376,12 @@ struct hnae3_ae_algo { struct hnae3_tc_info { u16 tqp_offset; /* TQP offset from base TQP */ u16 tqp_count; /* Total TQPs */ - u8 up; /* user priority */ u8 tc; /* TC index */ bool enable; /* If this TC is enable or not */ }; #define HNAE3_MAX_TC 8 +#define HNAE3_MAX_USER_PRIO 8 struct hnae3_knic_private_info { struct net_device *netdev; /* Set by KNIC client when init instance */ u16 rss_size; /* Allocated RSS queues */ @@ -389,6 +389,7 @@ struct hnae3_knic_private_info { u16 num_desc; u8 num_tc; /* Total number of enabled TCs */ + u8 prio_tc[HNAE3_MAX_USER_PRIO]; /* TC indexed by prio */ struct hnae3_tc_info tc_info[HNAE3_MAX_TC]; /* Idx of array is HW TC */ u16 num_tqps; /* total number of TQPs in this handle */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 7f8dd129c10d..9fcfd9395424 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -176,7 +176,6 @@ struct hclge_pg_info { struct hclge_tc_info { u8 tc_id; u8 tc_sch_mode; /* 0: sp; 1: dwrr */ - u8 up; u8 pgid; u32 bw_limit; }; @@ -197,6 +196,7 @@ struct hclge_tm_info { u8 num_tc; u8 num_pg; /* It must be 1 if vNET-Base schd */ u8 pg_dwrr[HCLGE_PG_NUM]; + u8 prio_tc[HNAE3_MAX_USER_PRIO]; struct hclge_pg_info pg_info[HCLGE_PG_NUM]; struct hclge_tc_info tc_info[HNAE3_MAX_TC]; enum hclge_fc_mode fc_mode; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index b7ba7aa66620..73a75d7cc551 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -128,9 +128,7 @@ static int hclge_fill_pri_array(struct hclge_dev *hdev, u8 *pri, u8 pri_id) { u8 tc; - for (tc = 0; tc < hdev->tm_info.num_tc; tc++) - if (hdev->tm_info.tc_info[tc].up == pri_id) - break; + tc = hdev->tm_info.prio_tc[pri_id]; if (tc >= hdev->tm_info.num_tc) return -EINVAL; @@ -158,7 +156,7 @@ static int hclge_up_to_tc_map(struct hclge_dev *hdev) hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_PRI_TO_TC_MAPPING, false); - for (pri_id = 0; pri_id < hdev->tm_info.num_tc; pri_id++) { + for (pri_id = 0; pri_id < HNAE3_MAX_USER_PRIO; pri_id++) { ret = hclge_fill_pri_array(hdev, pri, pri_id); if (ret) return ret; @@ -405,16 +403,17 @@ static void hclge_tm_vport_tc_info_update(struct hclge_vport *vport) kinfo->tc_info[i].tqp_offset = i * kinfo->rss_size; kinfo->tc_info[i].tqp_count = kinfo->rss_size; kinfo->tc_info[i].tc = i; - kinfo->tc_info[i].up = hdev->tm_info.tc_info[i].up; } else { /* Set to default queue if TC is disable */ kinfo->tc_info[i].enable = false; kinfo->tc_info[i].tqp_offset = 0; kinfo->tc_info[i].tqp_count = 1; kinfo->tc_info[i].tc = 0; - kinfo->tc_info[i].up = 0; } } + + memcpy(kinfo->prio_tc, hdev->tm_info.prio_tc, + FIELD_SIZEOF(struct hnae3_knic_private_info, prio_tc)); } static void hclge_tm_vport_info_update(struct hclge_dev *hdev) @@ -436,12 +435,15 @@ static void hclge_tm_tc_info_init(struct hclge_dev *hdev) for (i = 0; i < hdev->tm_info.num_tc; i++) { hdev->tm_info.tc_info[i].tc_id = i; hdev->tm_info.tc_info[i].tc_sch_mode = HCLGE_SCH_MODE_DWRR; - hdev->tm_info.tc_info[i].up = i; hdev->tm_info.tc_info[i].pgid = 0; hdev->tm_info.tc_info[i].bw_limit = hdev->tm_info.pg_info[0].bw_limit; } + for (i = 0; i < HNAE3_MAX_USER_PRIO; i++) + hdev->tm_info.prio_tc[i] = + (i >= hdev->tm_info.num_tc) ? 0 : i; + hdev->flag &= ~HCLGE_FLAG_DCB_ENABLE; } -- cgit From c8e1812960eeae42e2183154927028511c4bc566 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 20 Sep 2017 15:45:36 +0300 Subject: net_sched: always reset qdisc backlog in qdisc_reset() SKB stored in qdisc->gso_skb also counted into backlog. Some qdiscs don't reset backlog to zero in ->reset(), for example sfq just dequeue and free all queued skb. Signed-off-by: Konstantin Khlebnikov Fixes: 2ccccf5fb43f ("net_sched: update hierarchical backlog too") Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 92237e75dbbc..bf8c81e07c70 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -685,6 +685,7 @@ void qdisc_reset(struct Qdisc *qdisc) qdisc->gso_skb = NULL; } qdisc->q.qlen = 0; + qdisc->qstats.backlog = 0; } EXPORT_SYMBOL(qdisc_reset); -- cgit From 21f4d5cc25ec0e6e8eb8420dd2c399e6d2fc7d14 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 20 Sep 2017 15:46:11 +0300 Subject: net_sched/hfsc: fix curve activation in hfsc_change_class() If real-time or fair-share curves are enabled in hfsc_change_class() class isn't inserted into rb-trees yet. Thus init_ed() and init_vf() must be called in place of update_ed() and update_vf(). Remove isn't required because for now curves cannot be disabled. Signed-off-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- net/sched/sch_hfsc.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index daaf214e5201..3f88b75488b0 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -958,6 +958,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } if (cl != NULL) { + int old_flags; + if (parentid) { if (cl->cl_parent && cl->cl_parent->cl_common.classid != parentid) @@ -978,6 +980,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } sch_tree_lock(sch); + old_flags = cl->cl_flags; + if (rsc != NULL) hfsc_change_rsc(cl, rsc, cur_time); if (fsc != NULL) @@ -986,10 +990,21 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, hfsc_change_usc(cl, usc, cur_time); if (cl->qdisc->q.qlen != 0) { - if (cl->cl_flags & HFSC_RSC) - update_ed(cl, qdisc_peek_len(cl->qdisc)); - if (cl->cl_flags & HFSC_FSC) - update_vf(cl, 0, cur_time); + int len = qdisc_peek_len(cl->qdisc); + + if (cl->cl_flags & HFSC_RSC) { + if (old_flags & HFSC_RSC) + update_ed(cl, len); + else + init_ed(cl, len); + } + + if (cl->cl_flags & HFSC_FSC) { + if (old_flags & HFSC_FSC) + update_vf(cl, 0, cur_time); + else + init_vf(cl, len); + } } sch_tree_unlock(sch); -- cgit From fe2502e49b58606580c77b3d84e42f946de182d8 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 20 Sep 2017 09:18:45 -0700 Subject: net_sched: remove cls_flower idr on failure Fixes: c15ab236d69d ("net/sched: Change cls_flower to use IDR") Cc: Chris Mi Cc: Jiri Pirko Signed-off-by: Cong Wang Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1a267e77c6de..d230cb4c8094 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -922,28 +922,28 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (!tc_flags_valid(fnew->flags)) { err = -EINVAL; - goto errout; + goto errout_idr; } } err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); if (err) - goto errout; + goto errout_idr; err = fl_check_assign_mask(head, &mask); if (err) - goto errout; + goto errout_idr; if (!tc_skip_sw(fnew->flags)) { if (!fold && fl_lookup(head, &fnew->mkey)) { err = -EEXIST; - goto errout; + goto errout_idr; } err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, head->ht_params); if (err) - goto errout; + goto errout_idr; } if (!tc_skip_hw(fnew->flags)) { @@ -952,7 +952,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, &mask.key, fnew); if (err) - goto errout; + goto errout_idr; } if (!tc_in_hw(fnew->flags)) @@ -981,6 +981,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, kfree(tb); return 0; +errout_idr: + if (fnew->handle) + idr_remove_ext(&head->handle_idr, fnew->handle); errout: tcf_exts_destroy(&fnew->exts); kfree(fnew); -- cgit From 0ab09befdbb7ca9b969d6206108629ddff43876e Mon Sep 17 00:00:00 2001 From: Alex Ng Date: Wed, 20 Sep 2017 11:17:35 -0700 Subject: hv_netvsc: fix send buffer failure on MTU change MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If MTU is changed the host would reject the send buffer change. This problem is result of recent change to allow changing send buffer size. Every time we change the MTU, we store the previous net_device section count before destroying the buffer, but we don’t store the previous section size. When we reinitialize the buffer, its size is calculated by multiplying the previous count and previous size. Since we continuously increase the MTU, the host returns us a decreasing count value while the section size is reinitialized to 1728 bytes every time. This eventually leads to a condition where the calculated buf_size is so small that the host rejects it. Fixes: 8b5327975ae1 ("netvsc: allow controlling send/recv buffer size") Signed-off-by: Alex Ng Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/hyperv_net.h | 2 ++ drivers/net/hyperv/netvsc.c | 7 ++----- drivers/net/hyperv/netvsc_drv.c | 8 ++++++++ 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index d98cdfb1536b..5176be76ca7d 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -150,6 +150,8 @@ struct netvsc_device_info { u32 num_chn; u32 send_sections; u32 recv_sections; + u32 send_section_size; + u32 recv_section_size; }; enum rndis_device_state { diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index a5511b7326af..8d5077fb0492 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -76,9 +76,6 @@ static struct netvsc_device *alloc_net_device(void) net_device->max_pkt = RNDIS_MAX_PKT_DEFAULT; net_device->pkt_align = RNDIS_PKT_ALIGN_DEFAULT; - net_device->recv_section_size = NETVSC_RECV_SECTION_SIZE; - net_device->send_section_size = NETVSC_SEND_SECTION_SIZE; - init_completion(&net_device->channel_init_wait); init_waitqueue_head(&net_device->subchan_open); INIT_WORK(&net_device->subchan_work, rndis_set_subchannel); @@ -262,7 +259,7 @@ static int netvsc_init_buf(struct hv_device *device, int ret = 0; /* Get receive buffer area. */ - buf_size = device_info->recv_sections * net_device->recv_section_size; + buf_size = device_info->recv_sections * device_info->recv_section_size; buf_size = roundup(buf_size, PAGE_SIZE); net_device->recv_buf = vzalloc(buf_size); @@ -344,7 +341,7 @@ static int netvsc_init_buf(struct hv_device *device, goto cleanup; /* Now setup the send buffer. */ - buf_size = device_info->send_sections * net_device->send_section_size; + buf_size = device_info->send_sections * device_info->send_section_size; buf_size = round_up(buf_size, PAGE_SIZE); net_device->send_buf = vzalloc(buf_size); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index d4902ee5f260..a32ae02e1b6c 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -848,7 +848,9 @@ static int netvsc_set_channels(struct net_device *net, device_info.num_chn = count; device_info.ring_size = ring_size; device_info.send_sections = nvdev->send_section_cnt; + device_info.send_section_size = nvdev->send_section_size; device_info.recv_sections = nvdev->recv_section_cnt; + device_info.recv_section_size = nvdev->recv_section_size; rndis_filter_device_remove(dev, nvdev); @@ -963,7 +965,9 @@ static int netvsc_change_mtu(struct net_device *ndev, int mtu) device_info.ring_size = ring_size; device_info.num_chn = nvdev->num_chn; device_info.send_sections = nvdev->send_section_cnt; + device_info.send_section_size = nvdev->send_section_size; device_info.recv_sections = nvdev->recv_section_cnt; + device_info.recv_section_size = nvdev->recv_section_size; rndis_filter_device_remove(hdev, nvdev); @@ -1485,7 +1489,9 @@ static int netvsc_set_ringparam(struct net_device *ndev, device_info.num_chn = nvdev->num_chn; device_info.ring_size = ring_size; device_info.send_sections = new_tx; + device_info.send_section_size = nvdev->send_section_size; device_info.recv_sections = new_rx; + device_info.recv_section_size = nvdev->recv_section_size; netif_device_detach(ndev); was_opened = rndis_filter_opened(nvdev); @@ -1934,7 +1940,9 @@ static int netvsc_probe(struct hv_device *dev, device_info.ring_size = ring_size; device_info.num_chn = VRSS_CHANNEL_DEFAULT; device_info.send_sections = NETVSC_DEFAULT_TX; + device_info.send_section_size = NETVSC_SEND_SECTION_SIZE; device_info.recv_sections = NETVSC_DEFAULT_RX; + device_info.recv_section_size = NETVSC_RECV_SECTION_SIZE; nvdev = rndis_filter_device_add(dev, &device_info); if (IS_ERR(nvdev)) { -- cgit From 4a7a3860caac1a8779e8c459d8abe21b111798d6 Mon Sep 17 00:00:00 2001 From: Timur Tabi Date: Wed, 20 Sep 2017 15:32:53 -0500 Subject: net: qcom/emac: add software control for pause frame mode The EMAC has the option of sending only a single pause frame when flow control is enabled and the RX queue is full. Although sending only one pause frame has little value, this would allow admins to enable automatic flow control without having to worry about the EMAC flooding nearby switches with pause frames if the kernel hangs. The option is enabled by using the single-pause-mode private flag. Signed-off-by: Timur Tabi Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/emac/emac-ethtool.c | 30 +++++++++++++++++++++++ drivers/net/ethernet/qualcomm/emac/emac-mac.c | 22 +++++++++++++++++ drivers/net/ethernet/qualcomm/emac/emac.c | 3 +++ drivers/net/ethernet/qualcomm/emac/emac.h | 3 +++ 4 files changed, 58 insertions(+) diff --git a/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c b/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c index bbe24639aa5a..c8c6231b87f3 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c @@ -88,6 +88,8 @@ static void emac_set_msglevel(struct net_device *netdev, u32 data) static int emac_get_sset_count(struct net_device *netdev, int sset) { switch (sset) { + case ETH_SS_PRIV_FLAGS: + return 1; case ETH_SS_STATS: return EMAC_STATS_LEN; default: @@ -100,6 +102,10 @@ static void emac_get_strings(struct net_device *netdev, u32 stringset, u8 *data) unsigned int i; switch (stringset) { + case ETH_SS_PRIV_FLAGS: + strcpy(data, "single-pause-mode"); + break; + case ETH_SS_STATS: for (i = 0; i < EMAC_STATS_LEN; i++) { strlcpy(data, emac_ethtool_stat_strings[i], @@ -230,6 +236,27 @@ static int emac_get_regs_len(struct net_device *netdev) return EMAC_MAX_REG_SIZE * sizeof(u32); } +#define EMAC_PRIV_ENABLE_SINGLE_PAUSE BIT(0) + +static int emac_set_priv_flags(struct net_device *netdev, u32 flags) +{ + struct emac_adapter *adpt = netdev_priv(netdev); + + adpt->single_pause_mode = !!(flags & EMAC_PRIV_ENABLE_SINGLE_PAUSE); + + if (netif_running(netdev)) + return emac_reinit_locked(adpt); + + return 0; +} + +static u32 emac_get_priv_flags(struct net_device *netdev) +{ + struct emac_adapter *adpt = netdev_priv(netdev); + + return adpt->single_pause_mode ? EMAC_PRIV_ENABLE_SINGLE_PAUSE : 0; +} + static const struct ethtool_ops emac_ethtool_ops = { .get_link_ksettings = phy_ethtool_get_link_ksettings, .set_link_ksettings = phy_ethtool_set_link_ksettings, @@ -253,6 +280,9 @@ static const struct ethtool_ops emac_ethtool_ops = { .get_regs_len = emac_get_regs_len, .get_regs = emac_get_regs, + + .set_priv_flags = emac_set_priv_flags, + .get_priv_flags = emac_get_priv_flags, }; void emac_set_ethtool_ops(struct net_device *netdev) diff --git a/drivers/net/ethernet/qualcomm/emac/emac-mac.c b/drivers/net/ethernet/qualcomm/emac/emac-mac.c index bcd4708b3745..0ea3ca09c689 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-mac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-mac.c @@ -551,6 +551,28 @@ static void emac_mac_start(struct emac_adapter *adpt) mac &= ~(HUGEN | VLAN_STRIP | TPAUSE | SIMR | HUGE | MULTI_ALL | DEBUG_MODE | SINGLE_PAUSE_MODE); + /* Enable single-pause-frame mode if requested. + * + * If enabled, the EMAC will send a single pause frame when the RX + * queue is full. This normally leads to packet loss because + * the pause frame disables the remote MAC only for 33ms (the quanta), + * and then the remote MAC continues sending packets even though + * the RX queue is still full. + * + * If disabled, the EMAC sends a pause frame every 31ms until the RX + * queue is no longer full. Normally, this is the preferred + * method of operation. However, when the system is hung (e.g. + * cores are halted), the EMAC interrupt handler is never called + * and so the RX queue fills up quickly and stays full. The resuling + * non-stop "flood" of pause frames sometimes has the effect of + * disabling nearby switches. In some cases, other nearby switches + * are also affected, shutting down the entire network. + * + * The user can enable or disable single-pause-frame mode + * via ethtool. + */ + mac |= adpt->single_pause_mode ? SINGLE_PAUSE_MODE : 0; + writel_relaxed(csr1, adpt->csr + EMAC_EMAC_WRAPPER_CSR1); writel_relaxed(mac, adpt->base + EMAC_MAC_CTRL); diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c index 60850bfa3d32..759543512117 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac.c @@ -443,6 +443,9 @@ static void emac_init_adapter(struct emac_adapter *adpt) /* default to automatic flow control */ adpt->automatic = true; + + /* Disable single-pause-frame mode by default */ + adpt->single_pause_mode = false; } /* Get the clock */ diff --git a/drivers/net/ethernet/qualcomm/emac/emac.h b/drivers/net/ethernet/qualcomm/emac/emac.h index 8ee4ec6aef2e..d7c9f44209d4 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac.h +++ b/drivers/net/ethernet/qualcomm/emac/emac.h @@ -363,6 +363,9 @@ struct emac_adapter { bool tx_flow_control; bool rx_flow_control; + /* True == use single-pause-frame mode. */ + bool single_pause_mode; + /* Ring parameter */ u8 tpd_burst; u8 rfd_burst; -- cgit From 19cab8872692960535aa6d12e3a295ac51d1a648 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 20 Sep 2017 15:52:13 -0700 Subject: net: ethtool: Add back transceiver type Commit 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API") deprecated the ethtool_cmd::transceiver field, which was fine in premise, except that the PHY library was actually using it to report the type of transceiver: internal or external. Use the first word of the reserved field to put this __u8 transceiver field back in. It is made read-only, and we don't expect the ETHTOOL_xLINKSETTINGS API to be doing anything with this anyway, so this is mostly for the legacy path where we do: ethtool_get_settings() -> dev->ethtool_ops->get_link_ksettings() -> convert_link_ksettings_to_legacy_settings() to have no information loss compared to the legacy get_settings API. Fixes: 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 6 +++++- net/core/ethtool.c | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9c041dae8e2c..5bd1b1de4ea0 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1753,6 +1753,8 @@ enum ethtool_reset_flags { * %ethtool_link_mode_bit_indices for the link modes, and other * link features that the link partner advertised through * autonegotiation; 0 if unknown or not applicable. Read-only. + * @transceiver: Used to distinguish different possible PHY types, + * reported consistently by PHYLIB. Read-only. * * If autonegotiation is disabled, the speed and @duplex represent the * fixed link mode and are writable if the driver supports multiple @@ -1804,7 +1806,9 @@ struct ethtool_link_settings { __u8 eth_tp_mdix; __u8 eth_tp_mdix_ctrl; __s8 link_mode_masks_nwords; - __u32 reserved[8]; + __u8 transceiver; + __u8 reserved1[3]; + __u32 reserved[7]; __u32 link_mode_masks[0]; /* layout of link_mode_masks fields: * __u32 map_supported[link_mode_masks_nwords]; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 6a582ae4c5d9..3228411ada0f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -525,6 +525,8 @@ convert_link_ksettings_to_legacy_settings( = link_ksettings->base.eth_tp_mdix; legacy_settings->eth_tp_mdix_ctrl = link_ksettings->base.eth_tp_mdix_ctrl; + legacy_settings->transceiver + = link_ksettings->base.transceiver; return retval; } -- cgit From ceb628134a75564d7bfa8e4ef902e6e588339e11 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 20 Sep 2017 15:52:14 -0700 Subject: net: phy: Keep reporting transceiver type With commit 2d55173e71b0 ("phy: add generic function to support ksetting support"), we lost the ability to report the transceiver type like we used to. Now that we have added back the transceiver type to ethtool_link_settings, we can report it back like we used to and have no loss of information. Fixes: 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API") Fixes: 2d55173e71b0 ("phy: add generic function to support ksetting support") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index e842d2cd1ee7..2b1e67bc1e73 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -373,7 +373,8 @@ void phy_ethtool_ksettings_get(struct phy_device *phydev, cmd->base.port = PORT_BNC; else cmd->base.port = PORT_MII; - + cmd->base.transceiver = phy_is_internal(phydev) ? + XCVR_INTERNAL : XCVR_EXTERNAL; cmd->base.phy_address = phydev->mdio.addr; cmd->base.autoneg = phydev->autoneg; cmd->base.eth_tp_mdix_ctrl = phydev->mdix_ctrl; -- cgit From 8a7ffeb795f864dd605b579c05934cba95dc8ad3 Mon Sep 17 00:00:00 2001 From: Nisar Sayed Date: Thu, 21 Sep 2017 02:36:36 +0530 Subject: lan78xx: Fix for eeprom read/write when device auto suspend Fix for eeprom read/write when device auto suspend Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Signed-off-by: Nisar Sayed Signed-off-by: David S. Miller --- drivers/net/usb/lan78xx.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index b99a7fb09f8e..fcf85ae37435 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1265,30 +1265,46 @@ static int lan78xx_ethtool_get_eeprom(struct net_device *netdev, struct ethtool_eeprom *ee, u8 *data) { struct lan78xx_net *dev = netdev_priv(netdev); + int ret; + + ret = usb_autopm_get_interface(dev->intf); + if (ret) + return ret; ee->magic = LAN78XX_EEPROM_MAGIC; - return lan78xx_read_raw_eeprom(dev, ee->offset, ee->len, data); + ret = lan78xx_read_raw_eeprom(dev, ee->offset, ee->len, data); + + usb_autopm_put_interface(dev->intf); + + return ret; } static int lan78xx_ethtool_set_eeprom(struct net_device *netdev, struct ethtool_eeprom *ee, u8 *data) { struct lan78xx_net *dev = netdev_priv(netdev); + int ret; + + ret = usb_autopm_get_interface(dev->intf); + if (ret) + return ret; /* Allow entire eeprom update only */ if ((ee->magic == LAN78XX_EEPROM_MAGIC) && (ee->offset == 0) && (ee->len == 512) && (data[0] == EEPROM_INDICATOR)) - return lan78xx_write_raw_eeprom(dev, ee->offset, ee->len, data); + ret = lan78xx_write_raw_eeprom(dev, ee->offset, ee->len, data); else if ((ee->magic == LAN78XX_OTP_MAGIC) && (ee->offset == 0) && (ee->len == 512) && (data[0] == OTP_INDICATOR_1)) - return lan78xx_write_raw_otp(dev, ee->offset, ee->len, data); + ret = lan78xx_write_raw_otp(dev, ee->offset, ee->len, data); - return -EINVAL; + usb_autopm_put_interface(dev->intf); + + return ret; } static void lan78xx_get_strings(struct net_device *netdev, u32 stringset, -- cgit From c077682282dd34c75e8477c22dffe7c9aebc6e98 Mon Sep 17 00:00:00 2001 From: Nisar Sayed Date: Thu, 21 Sep 2017 02:36:37 +0530 Subject: lan78xx: Allow EEPROM write for less than MAX_EEPROM_SIZE Allow EEPROM write for less than MAX_EEPROM_SIZE Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Signed-off-by: Nisar Sayed Signed-off-by: David S. Miller --- drivers/net/usb/lan78xx.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index fcf85ae37435..f8c63eec8353 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1290,11 +1290,10 @@ static int lan78xx_ethtool_set_eeprom(struct net_device *netdev, if (ret) return ret; - /* Allow entire eeprom update only */ - if ((ee->magic == LAN78XX_EEPROM_MAGIC) && - (ee->offset == 0) && - (ee->len == 512) && - (data[0] == EEPROM_INDICATOR)) + /* Invalid EEPROM_INDICATOR at offset zero will result in a failure + * to load data from EEPROM + */ + if (ee->magic == LAN78XX_EEPROM_MAGIC) ret = lan78xx_write_raw_eeprom(dev, ee->offset, ee->len, data); else if ((ee->magic == LAN78XX_OTP_MAGIC) && (ee->offset == 0) && -- cgit From e365280521029c9366bab038915274ddaa1b7195 Mon Sep 17 00:00:00 2001 From: Nisar Sayed Date: Thu, 21 Sep 2017 02:36:38 +0530 Subject: lan78xx: Use default values loaded from EEPROM/OTP after reset Use default value of auto duplex and auto speed values loaded from EEPROM/OTP after reset. The LAN78xx allows platform configurations to be loaded from EEPROM/OTP. Ex: When external phy is connected, the MAC can be configured to have correct auto speed, auto duplex, auto polarity configured from the EEPROM/OTP. Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Signed-off-by: Nisar Sayed Signed-off-by: David S. Miller --- drivers/net/usb/lan78xx.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index f8c63eec8353..0161f77641fa 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2449,7 +2449,6 @@ static int lan78xx_reset(struct lan78xx_net *dev) /* LAN7801 only has RGMII mode */ if (dev->chipid == ID_REV_CHIP_ID_7801_) buf &= ~MAC_CR_GMII_EN_; - buf |= MAC_CR_AUTO_DUPLEX_ | MAC_CR_AUTO_SPEED_; ret = lan78xx_write_reg(dev, MAC_CR, buf); ret = lan78xx_read_reg(dev, MAC_TX, &buf); -- cgit From f0ef1f4f2b772c0a1c8b35a6ae3edf974cc110dd Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Thu, 21 Sep 2017 08:24:27 +0200 Subject: net: stmmac: Cocci spatch "of_table" Make sure (of/i2c/platform)_device_id tables are NULL terminated. Found by coccinelle spatch "misc/of_table.cocci" Signed-off-by: Thomas Meyer Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index a366b3747eeb..8a280b48e3a9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -315,6 +315,7 @@ static int stmmac_dt_phy(struct plat_stmmacenet_data *plat, { .compatible = "allwinner,sun8i-h3-emac" }, { .compatible = "allwinner,sun8i-v3s-emac" }, { .compatible = "allwinner,sun50i-a64-emac" }, + {}, }; /* If phy-handle property is passed from DT, use it as the PHY */ -- cgit From 09579ac803a3638344b8544b5940793d5358673e Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Thu, 21 Sep 2017 09:16:26 +0200 Subject: net/smc: add missing dev_put In the infiniband part, SMC currently uses get_netdev which calls dev_hold on the returned net device. However, the SMC code never calls dev_put on that net device resulting in a wrong reference count. This patch adds a dev_put after the usage of the net device to fix the issue. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_ib.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 547e0e113b17..0b5852299158 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -380,6 +380,7 @@ static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport); if (ndev) { memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN); + dev_put(ndev); } else if (!rc) { memcpy(&smcibdev->mac[ibport - 1][0], &smcibdev->gid[ibport - 1].raw[8], 3); -- cgit From 846e344eb7229018457d6d6fc1ab0cc0a167692f Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Thu, 21 Sep 2017 09:16:27 +0200 Subject: net/smc: add receive timeout check The SMC receive function currently lacks a timeout check under the condition that no data were received and no data are available. This patch adds such a check. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_rx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index b17a333e9bb0..3e631ae4b6b6 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -148,6 +148,8 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, read_done = sock_intr_errno(timeo); break; } + if (!timeo) + return -EAGAIN; } if (!atomic_read(&conn->bytes_to_rcv)) { -- cgit From 731b008560e6dfaf5fb297543f17bbe9bb868f3c Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:28 +0200 Subject: net/smc: take RCU read lock for routing cache lookup smc_netinfo_by_tcpsk() looks up the routing cache. Such a lookup requires protection by an RCU read lock. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 8c6d24b2995d..2e8d2dabac0c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -282,6 +282,7 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet, u8 *prefix_len) { struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct in_device *in_dev; struct sockaddr_in addr; int rc = -ENOENT; int len; @@ -298,14 +299,17 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock, /* get address to which the internal TCP socket is bound */ kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len); /* analyze IPv4 specific data of net_device belonging to TCP socket */ - for_ifa(dst->dev->ip_ptr) { - if (ifa->ifa_address != addr.sin_addr.s_addr) + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dst->dev); + for_ifa(in_dev) { + if (!inet_ifa_match(addr.sin_addr.s_addr, ifa)) continue; *prefix_len = inet_mask_len(ifa->ifa_mask); *subnet = ifa->ifa_address & ifa->ifa_mask; rc = 0; break; - } endfor_ifa(dst->dev->ip_ptr); + } endfor_ifa(in_dev); + rcu_read_unlock(); out_rel: dst_release(dst); -- cgit From a6832c3acdb2ceb099ec3c385777fbaa6d5a5fd6 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:29 +0200 Subject: net/smc: adjust net_device refcount smc_pnet_fill_entry() uses dev_get_by_name() adding a refcount to ndev. The following smc_pnet_enter() has to reduce the refcount if the entry to be added exists already in the pnet table. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_pnet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 78f7af28ae4f..31f8453c25c5 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -181,8 +181,10 @@ static int smc_pnet_enter(struct smc_pnetentry *new_pnetelem) sizeof(new_pnetelem->ndev->name)) || smc_pnet_same_ibname(pnetelem, new_pnetelem->smcibdev->ibdev->name, - new_pnetelem->ib_port)) + new_pnetelem->ib_port)) { + dev_put(pnetelem->ndev); goto found; + } } list_add_tail(&new_pnetelem->list, &smc_pnettable.pnetlist); rc = 0; -- cgit From 8301fa44b41b1ca46a0547809eb173112559dc51 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:30 +0200 Subject: net/smc: adapt send request completion notification The solicited flag is meaningful for the receive completion queue. Ask for next work completion of any type on the send queue. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_wr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index ab56bda66783..525d91e0d57e 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -244,7 +244,7 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) int rc; ib_req_notify_cq(link->smcibdev->roce_cq_send, - IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS); + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], &failed_wr); -- cgit From 5bc11ddbdf7fc6681db5c3f9a92cdee0f19cee1e Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:31 +0200 Subject: net/smc: longer delay for client link group removal Client link group creation always follows the server linkgroup creation. If peer creates a new server link group, client has to create a new client link group. If peer reuses a server link group for a new connection, client has to reuse its client link group as well. This patch introduces a longer delay for client link group removal to make sure this link group still exists, once the peer decides to reuse a server link group. This avoids out-of-sync conditions for link groups. If already scheduled, modify the delay. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 1a16d51e2330..20b66e79c5d6 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -25,8 +25,9 @@ #include "smc_cdc.h" #include "smc_close.h" -#define SMC_LGR_NUM_INCR 256 -#define SMC_LGR_FREE_DELAY (600 * HZ) +#define SMC_LGR_NUM_INCR 256 +#define SMC_LGR_FREE_DELAY_SERV (600 * HZ) +#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10) static u32 smc_lgr_num; /* unique link group number */ @@ -107,8 +108,15 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); - if (reduced && !lgr->conns_num) - schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY); + if (!reduced || lgr->conns_num) + return; + /* client link group creation always follows the server link group + * creation. For client use a somewhat higher removal delay time, + * otherwise there is a risk of out-of-sync link groups. + */ + mod_delayed_work(system_wq, &lgr->free_work, + lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT : + SMC_LGR_FREE_DELAY_SERV); } static void smc_lgr_free_work(struct work_struct *work) -- cgit From bfbedfd38378c1ad9df84469403d69c17b074b66 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:32 +0200 Subject: net/smc: terminate link group if out-of-sync is received An out-of-sync condition can just be detected by the client. If the server receives a CLC DECLINE message indicating an out-of-sync condition for the link groups, the server must clean up the out-of-sync link group. There is no need for an extra third parameter in smc_clc_send_decline(). Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 6 ++---- net/smc/smc_clc.c | 10 +++++----- net/smc/smc_clc.h | 3 +-- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 2e8d2dabac0c..745f145d4c4d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -513,7 +513,7 @@ decline_rdma: /* RDMA setup failed, switch back to TCP */ smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(smc, reason_code, 0); + rc = smc_clc_send_decline(smc, reason_code); if (rc < sizeof(struct smc_clc_msg_decline)) goto out_err; } @@ -808,8 +808,6 @@ static void smc_listen_work(struct work_struct *work) rc = local_contact; if (rc == -ENOMEM) reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ - else if (rc == -ENOLINK) - reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */ goto decline_rdma; } link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; @@ -903,7 +901,7 @@ decline_rdma: smc_conn_free(&new_smc->conn); new_smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(new_smc, reason_code, 0); + rc = smc_clc_send_decline(new_smc, reason_code); if (rc < sizeof(struct smc_clc_msg_decline)) goto out_err; } diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 3934913ab835..b7dd2743fb5c 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -95,9 +95,10 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } if (clcm->type == SMC_CLC_DECLINE) { reason_code = SMC_CLC_DECL_REPLY; - if (ntohl(((struct smc_clc_msg_decline *)buf)->peer_diagnosis) - == SMC_CLC_DECL_SYNCERR) + if (((struct smc_clc_msg_decline *)buf)->hdr.flag) { smc->conn.lgr->sync_err = true; + smc_lgr_terminate(smc->conn.lgr); + } } out: @@ -105,8 +106,7 @@ out: } /* send CLC DECLINE message across internal TCP socket */ -int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, - u8 out_of_sync) +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) { struct smc_clc_msg_decline dclc; struct msghdr msg; @@ -118,7 +118,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, dclc.hdr.type = SMC_CLC_DECLINE; dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); dclc.hdr.version = SMC_CLC_V1; - dclc.hdr.flag = out_of_sync ? 1 : 0; + dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0; memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); dclc.peer_diagnosis = htonl(peer_diag_info); memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 13db8ce177c9..1c55414041d4 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -106,8 +106,7 @@ struct smc_ib_device; int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type); -int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, - u8 out_of_sync); +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport); int smc_clc_send_confirm(struct smc_sock *smc); -- cgit From 18e537cd58e8d6932719bfa79cb96a1fbc639199 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:33 +0200 Subject: net/smc: introduce a delay The number of outstanding work requests is limited. If all work requests are in use, tx processing is postponed to another scheduling of the tx worker. Switch to a delayed worker to have a gap for tx completion queue events before the next retry. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc.h | 2 +- net/smc/smc_close.c | 12 +++++++----- net/smc/smc_tx.c | 12 ++++++++---- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index 6e44313e4467..0ccd6fa387ad 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -149,7 +149,7 @@ struct smc_connection { atomic_t sndbuf_space; /* remaining space in sndbuf */ u16 tx_cdc_seq; /* sequence # for CDC send */ spinlock_t send_lock; /* protect wr_sends */ - struct work_struct tx_work; /* retry of smc_cdc_msg_send */ + struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. * .prod cf. TCP rcv_nxt diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 3c2e166b5d22..5201bc103bd8 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -208,7 +208,7 @@ again: case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_state == SMC_ACTIVE) { /* send close request */ @@ -234,7 +234,7 @@ again: if (!smc_cdc_rxed_any_close(conn)) smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_err != ECONNABORTED) { /* confirm close from peer */ @@ -263,7 +263,9 @@ again: /* peer sending PeerConnectionClosed will cause transition */ break; case SMC_PROCESSABORT: - cancel_work_sync(&conn->tx_work); + release_sock(sk); + cancel_delayed_work_sync(&conn->tx_work); + lock_sock(sk); smc_close_abort(conn); sk->sk_state = SMC_CLOSED; smc_close_wait_tx_pends(smc); @@ -425,7 +427,7 @@ again: case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); /* send close wr request */ rc = smc_close_wr(conn); @@ -439,7 +441,7 @@ again: if (!smc_cdc_rxed_any_close(conn)) smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); /* confirm close from peer */ rc = smc_close_wr(conn); diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 3c656beb8820..3866573288dd 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -24,6 +24,8 @@ #include "smc_cdc.h" #include "smc_tx.h" +#define SMC_TX_WORK_DELAY HZ + /***************************** sndbuf producer *******************************/ /* callback implementation for sk.sk_write_space() @@ -406,7 +408,8 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) goto out_unlock; } rc = 0; - schedule_work(&conn->tx_work); + schedule_delayed_work(&conn->tx_work, + SMC_TX_WORK_DELAY); } goto out_unlock; } @@ -430,7 +433,7 @@ out_unlock: */ static void smc_tx_work(struct work_struct *work) { - struct smc_connection *conn = container_of(work, + struct smc_connection *conn = container_of(to_delayed_work(work), struct smc_connection, tx_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); @@ -468,7 +471,8 @@ void smc_tx_consumer_update(struct smc_connection *conn) if (!rc) rc = smc_cdc_msg_send(conn, wr_buf, pend); if (rc < 0) { - schedule_work(&conn->tx_work); + schedule_delayed_work(&conn->tx_work, + SMC_TX_WORK_DELAY); return; } smc_curs_write(&conn->rx_curs_confirmed, @@ -487,6 +491,6 @@ void smc_tx_consumer_update(struct smc_connection *conn) void smc_tx_init(struct smc_sock *smc) { smc->sk.sk_write_space = smc_tx_write_space; - INIT_WORK(&smc->conn.tx_work, smc_tx_work); + INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); spin_lock_init(&smc->conn.send_lock); } -- cgit From 8c96feeeb39ba0b89c6121f71e8f7aa2a4d46671 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:34 +0200 Subject: net/smc: no close wait in case of process shut down Usually socket closing is delayed if there is still data available in the send buffer to be transmitted. If a process is killed, the delay should be avoided. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_close.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 5201bc103bd8..f0d16fb825f7 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -174,15 +174,15 @@ int smc_close_active(struct smc_sock *smc) { struct smc_cdc_conn_state_flags *txflags = &smc->conn.local_tx_ctrl.conn_state_flags; - long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT; struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; int old_state; + long timeout; int rc = 0; - if (sock_flag(sk, SOCK_LINGER) && - !(current->flags & PF_EXITING)) - timeout = sk->sk_lingertime; + timeout = current->flags & PF_EXITING ? + 0 : sock_flag(sk, SOCK_LINGER) ? + sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; again: old_state = sk->sk_state; @@ -413,13 +413,14 @@ void smc_close_sock_put_work(struct work_struct *work) int smc_close_shutdown_write(struct smc_sock *smc) { struct smc_connection *conn = &smc->conn; - long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT; struct sock *sk = &smc->sk; int old_state; + long timeout; int rc = 0; - if (sock_flag(sk, SOCK_LINGER)) - timeout = sk->sk_lingertime; + timeout = current->flags & PF_EXITING ? + 0 : sock_flag(sk, SOCK_LINGER) ? + sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; again: old_state = sk->sk_state; -- cgit From 059fbe8b5171960bd5c2371bb327ac18733773de Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 21 Sep 2017 13:27:02 +0200 Subject: net: phy: Fix truncation of large IRQ numbers in phy_attached_print() Given NR_IRQS is 2048 on sparc64, and even 32784 on alpha, 3 digits is not enough to represent interrupt numbers on all architectures. Hence PHY interrupt numbers may be truncated during printing. Increase the buffer size from 4 to 8 bytes to fix this. Fixes: 5e369aefdce4818c ("net: stmmac: Delete dead code for MDIO registration") Signed-off-by: Geert Uytterhoeven Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 8cf0c5901f95..67f25ac29025 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -879,7 +879,7 @@ void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) { const char *drv_name = phydev->drv ? phydev->drv->name : "unbound"; char *irq_str; - char irq_num[4]; + char irq_num[8]; switch(phydev->irq) { case PHY_POLL: -- cgit From 222d7dbd258dad4cd5241c43ef818141fad5a87a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Sep 2017 09:15:46 -0700 Subject: net: prevent dst uses after free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In linux-4.13, Wei worked hard to convert dst to a traditional refcounted model, removing GC. We now want to make sure a dst refcount can not transition from 0 back to 1. The problem here is that input path attached a not refcounted dst to an skb. Then later, because packet is forwarded and hits skb_dst_force() before exiting RCU section, we might try to take a refcount on one dst that is about to be freed, if another cpu saw 1 -> 0 transition in dst_release() and queued the dst for freeing after one RCU grace period. Lets unify skb_dst_force() and skb_dst_force_safe(), since we should always perform the complete check against dst refcount, and not assume it is not zero. Bugzilla : https://bugzilla.kernel.org/show_bug.cgi?id=197005 [ 989.919496] skb_dst_force+0x32/0x34 [ 989.919498] __dev_queue_xmit+0x1ad/0x482 [ 989.919501] ? eth_header+0x28/0xc6 [ 989.919502] dev_queue_xmit+0xb/0xd [ 989.919504] neigh_connected_output+0x9b/0xb4 [ 989.919507] ip_finish_output2+0x234/0x294 [ 989.919509] ? ipt_do_table+0x369/0x388 [ 989.919510] ip_finish_output+0x12c/0x13f [ 989.919512] ip_output+0x53/0x87 [ 989.919513] ip_forward_finish+0x53/0x5a [ 989.919515] ip_forward+0x2cb/0x3e6 [ 989.919516] ? pskb_trim_rcsum.part.9+0x4b/0x4b [ 989.919518] ip_rcv_finish+0x2e2/0x321 [ 989.919519] ip_rcv+0x26f/0x2eb [ 989.919522] ? vlan_do_receive+0x4f/0x289 [ 989.919523] __netif_receive_skb_core+0x467/0x50b [ 989.919526] ? tcp_gro_receive+0x239/0x239 [ 989.919529] ? inet_gro_receive+0x226/0x238 [ 989.919530] __netif_receive_skb+0x4d/0x5f [ 989.919532] netif_receive_skb_internal+0x5c/0xaf [ 989.919533] napi_gro_receive+0x45/0x81 [ 989.919536] ixgbe_poll+0xc8a/0xf09 [ 989.919539] ? kmem_cache_free_bulk+0x1b6/0x1f7 [ 989.919540] net_rx_action+0xf4/0x266 [ 989.919543] __do_softirq+0xa8/0x19d [ 989.919545] irq_exit+0x5d/0x6b [ 989.919546] do_IRQ+0x9c/0xb5 [ 989.919548] common_interrupt+0x93/0x93 [ 989.919548] Similarly dst_clone() can use dst_hold() helper to have additional debugging, as a follow up to commit 44ebe79149ff ("net: add debug atomic_inc_not_zero() in dst_hold()") In net-next we will convert dst atomic_t to refcount_t for peace of mind. Fixes: a4c2fd7f7891 ("net: remove DST_NOCACHE flag") Signed-off-by: Eric Dumazet Cc: Wei Wang Reported-by: Paweł Staszewski Bisected-by: Paweł Staszewski Acked-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/dst.h | 22 ++++------------------ include/net/route.h | 2 +- include/net/sock.h | 2 +- 3 files changed, 6 insertions(+), 20 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 93568bd0a352..06a6765da074 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) static inline struct dst_entry *dst_clone(struct dst_entry *dst) { if (dst) - atomic_inc(&dst->__refcnt); + dst_hold(dst); return dst; } @@ -311,21 +311,6 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb __skb_dst_copy(nskb, oskb->_skb_refdst); } -/** - * skb_dst_force - makes sure skb dst is refcounted - * @skb: buffer - * - * If dst is not yet refcounted, let's do it - */ -static inline void skb_dst_force(struct sk_buff *skb) -{ - if (skb_dst_is_noref(skb)) { - WARN_ON(!rcu_read_lock_held()); - skb->_skb_refdst &= ~SKB_DST_NOREF; - dst_clone(skb_dst(skb)); - } -} - /** * dst_hold_safe - Take a reference on a dst if possible * @dst: pointer to dst entry @@ -339,16 +324,17 @@ static inline bool dst_hold_safe(struct dst_entry *dst) } /** - * skb_dst_force_safe - makes sure skb dst is refcounted + * skb_dst_force - makes sure skb dst is refcounted * @skb: buffer * * If dst is not yet refcounted and not destroyed, grab a ref on it. */ -static inline void skb_dst_force_safe(struct sk_buff *skb) +static inline void skb_dst_force(struct sk_buff *skb) { if (skb_dst_is_noref(skb)) { struct dst_entry *dst = skb_dst(skb); + WARN_ON(!rcu_read_lock_held()); if (!dst_hold_safe(dst)) dst = NULL; diff --git a/include/net/route.h b/include/net/route.h index 1b09a9368c68..57dfc6850d37 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -190,7 +190,7 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, rcu_read_lock(); err = ip_route_input_noref(skb, dst, src, tos, devin); if (!err) { - skb_dst_force_safe(skb); + skb_dst_force(skb); if (!skb_dst(skb)) err = -EINVAL; } diff --git a/include/net/sock.h b/include/net/sock.h index 03a362568357..a6b9a8d1a6df 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -856,7 +856,7 @@ void sk_stream_write_space(struct sock *sk); static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { /* dont let skb dst not refcounted, we are going to leave rcu lock */ - skb_dst_force_safe(skb); + skb_dst_force(skb); if (!sk->sk_backlog.tail) sk->sk_backlog.head = skb; -- cgit From b9b95da92de9d498ece7e6a82e2d6dcfc76fd9d8 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Fri, 22 Sep 2017 14:28:46 +0200 Subject: MAINTAINERS: update git tree locations for ieee802154 subsystem Patches for ieee802154 will go through my new trees towards netdev from now on. The 6LoWPAN subsystem will stay as is (shared between ieee802154 and bluetooth) and go through the bluetooth tree as usual. Signed-off-by: Stefan Schmidt Signed-off-by: David S. Miller --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 955f034fd523..61ee134cf6a8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6642,8 +6642,8 @@ M: Alexander Aring M: Stefan Schmidt L: linux-wpan@vger.kernel.org W: http://wpan.cakelab.org/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/sschmidt/wpan.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/sschmidt/wpan-next.git S: Maintained F: net/ieee802154/ F: net/mac802154/ -- cgit From 581fe0ea61584d88072527ae9fb9dcb9d1f2783e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Sep 2017 19:42:37 -0400 Subject: net: orphan frags on stand-alone ptype in dev_queue_xmit_nit Zerocopy skbs frags are copied when the skb is looped to a local sock. Commit 1080e512d44d ("net: orphan frags on receive") introduced calls to skb_orphan_frags to deliver_skb and __netif_receive_skb for this. With msg_zerocopy, these skbs can also exist in the tx path and thus loop from dev_queue_xmit_nit. This already calls deliver_skb in its loop. But it does not orphan before a separate pt_prev->func(). Add the missing skb_orphan_frags_rx. Changes v1->v2: handle skb_orphan_frags_rx failure Fixes: 1f8b977ab32d ("sock: enable MSG_ZEROCOPY") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 9a2254f9802f..588b473194a8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1948,8 +1948,12 @@ again: goto again; } out_unlock: - if (pt_prev) - pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + if (pt_prev) { + if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) + pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + else + kfree_skb(skb2); + } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); -- cgit From cbb2fb5c72f48d3029c144be0f0e61da1c7bccf7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 22 Sep 2017 20:20:06 -0400 Subject: net: set tb->fast_sk_family We need to set the tb->fast_sk_family properly so we can use the proper comparison function for all subsequent reuseport bind requests. Fixes: 637bc8bbe6c0 ("inet: reset tb->fastreuseport when adding a reuseport sk") Reported-and-tested-by: Cole Robinson Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b9c64b40a83a..f87f4805e244 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -328,6 +328,7 @@ success: tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif @@ -354,6 +355,7 @@ success: tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif -- cgit From 7a56673b58f2414679e926bba80309a037a4fd35 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 22 Sep 2017 20:20:07 -0400 Subject: net: use inet6_rcv_saddr to compare sockets In ipv6_rcv_saddr_equal() we need to use inet6_rcv_saddr(sk) for the ipv6 compare with the fast socket information to make sure we're doing the proper comparisons. Fixes: 637bc8bbe6c0 ("inet: reset tb->fastreuseport when adding a reuseport sk") Reported-and-tested-by: Cole Robinson Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f87f4805e244..a1bf30438bc5 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -266,7 +266,7 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, #if IS_ENABLED(CONFIG_IPV6) if (tb->fast_sk_family == AF_INET6) return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr, - &sk->sk_v6_rcv_saddr, + inet6_rcv_saddr(sk), tb->fast_rcv_saddr, sk->sk_rcv_saddr, tb->fast_ipv6_only, -- cgit From fbed24bcc69d3e48c5402c371f19f5c7688871e5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 22 Sep 2017 20:20:08 -0400 Subject: inet: fix improper empty comparison When doing my reuseport rework I screwed up and changed a if (hlist_empty(&tb->owners)) to if (!hlist_empty(&tb->owners)) This is obviously bad as all of the reuseport/reuse logic was reversed, which caused weird problems like allowing an ipv4 bind conflict if we opened an ipv4 only socket on a port followed by an ipv6 only socket on the same port. Fixes: b9470c27607b ("inet: kill smallest_size and smallest_port") Reported-by: Cole Robinson Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a1bf30438bc5..c039c937ba90 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -321,7 +321,7 @@ tb_found: goto fail_unlock; } success: - if (!hlist_empty(&tb->owners)) { + if (hlist_empty(&tb->owners)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = FASTREUSEPORT_ANY; -- cgit