From 3aa569c3fedbd0d16041d08bf6e89b8c43aee650 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 18 Jul 2012 12:42:58 +0200 Subject: mac80211: fix scan_sdata assignment We need to use RCU to assign scan_sdata. Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 169da0742c81..7644181cb6b4 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -304,7 +304,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted, if (local->scan_req != local->int_scan_req) cfg80211_scan_done(local->scan_req, aborted); local->scan_req = NULL; - local->scan_sdata = NULL; + rcu_assign_pointer(local->scan_sdata, NULL); local->scanning = 0; local->scan_channel = NULL; -- cgit From 5e31fc0815a4e2c72b1b495fe7a0d8f9bfb9e4b4 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 24 Jul 2012 08:35:39 +0200 Subject: wireless: reg: restore previous behaviour of chan->max_power calculations commit eccc068e8e84c8fe997115629925e0422a98e4de Author: Hong Wu Date: Wed Jan 11 20:33:39 2012 +0200 wireless: Save original maximum regulatory transmission power for the calucation of the local maximum transmit pow changed the way we calculate chan->max_power as min(chan->max_power, chan->max_reg_power). That broke rt2x00 (and perhaps some other drivers) that do not set chan->max_power. It is not so easy to fix this problem correctly in rt2x00. According to commit eccc068e8 changelog, change claim only to save maximum regulatory power - changing setting of chan->max_power was side effect. This patch restore previous calculations of chan->max_power and do not touch chan->max_reg_power. Cc: stable@vger.kernel.org # 3.4+ Signed-off-by: Stanislaw Gruszka Acked-by: Luis R. Rodriguez Signed-off-by: Johannes Berg --- net/wireless/reg.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index baf5704740ee..460af03d8149 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -891,7 +891,21 @@ static void handle_channel(struct wiphy *wiphy, chan->max_antenna_gain = min(chan->orig_mag, (int) MBI_TO_DBI(power_rule->max_antenna_gain)); chan->max_reg_power = (int) MBM_TO_DBM(power_rule->max_eirp); - chan->max_power = min(chan->max_power, chan->max_reg_power); + if (chan->orig_mpwr) { + /* + * Devices that have their own custom regulatory domain + * but also use WIPHY_FLAG_STRICT_REGULATORY will follow the + * passed country IE power settings. + */ + if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE && + wiphy->flags & WIPHY_FLAG_CUSTOM_REGULATORY && + wiphy->flags & WIPHY_FLAG_STRICT_REGULATORY) + chan->max_power = chan->max_reg_power; + else + chan->max_power = min(chan->orig_mpwr, + chan->max_reg_power); + } else + chan->max_power = chan->max_reg_power; } static void handle_band(struct wiphy *wiphy, -- cgit From ba846a502c6b3c0ff047861c891fd1afeed6e435 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Sun, 29 Jul 2012 16:25:10 +0300 Subject: mac80211: don't clear sched_scan_sdata on sched scan stop request ieee80211_request_sched_scan_stop() cleared local->sched_scan_sdata. However, sched_scan_sdata should be cleared only after the driver calls ieee80211_sched_scan_stopped() (like with normal hw scan). Clearing sched_scan_sdata too early caused ieee80211_sched_scan_stopped_work to exit prematurely without properly cleaning all the sched scan resources and without calling cfg80211_sched_scan_stopped (so userspace wasn't notified about sched scan completion). Signed-off-by: Eliad Peller Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index df36280ed78f..839dd9737989 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -984,7 +984,6 @@ int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata) kfree(local->sched_scan_ies.ie[i]); drv_sched_scan_stop(local, sdata); - rcu_assign_pointer(local->sched_scan_sdata, NULL); } out: mutex_unlock(&local->mtx); -- cgit From 2d9957cce674308f744f37f68b6bc3261bfdbbf4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 1 Aug 2012 20:54:52 +0200 Subject: mac80211: clear timer bits when disconnecting There's a corner case that can happen when we suspend with a timer running, then resume and disconnect. If we connect again, suspend and resume we might start timers that shouldn't be running. Reset the timer flags to avoid this. This affects both mesh and managed modes. Signed-off-by: Johannes Berg --- net/mac80211/mesh.c | 2 ++ net/mac80211/mlme.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 6fac18c0423f..d60b3d362e00 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -634,6 +634,8 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) local->fif_other_bss--; atomic_dec(&local->iff_allmultis); ieee80211_configure_filter(local); + + sdata->u.mesh.timers_running = 0; } static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index cef0c9e79aba..a4a5acdbaa4d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1430,6 +1430,8 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, del_timer_sync(&sdata->u.mgd.bcn_mon_timer); del_timer_sync(&sdata->u.mgd.timer); del_timer_sync(&sdata->u.mgd.chswitch_timer); + + sdata->u.mgd.timers_running = 0; } void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata, -- cgit From dd4c9260e7f23f2e951cbfb2726e468c6d30306c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 1 Aug 2012 21:03:21 +0200 Subject: mac80211: cancel mesh path timer The mesh path timer needs to be canceled when leaving the mesh as otherwise it could fire after the interface has been removed already. Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/mesh.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index d60b3d362e00..85572353a7e3 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -622,6 +622,7 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) del_timer_sync(&sdata->u.mesh.housekeeping_timer); del_timer_sync(&sdata->u.mesh.mesh_path_root_timer); + del_timer_sync(&sdata->u.mesh.mesh_path_timer); /* * If the timer fired while we waited for it, it will have * requeued the work. Now the work will be running again -- cgit From 30b678d844af3305cda5953467005cebb5d7b687 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 30 Jul 2012 15:57:00 +0000 Subject: net: Allow driver to limit number of GSO segments per skb A peer (or local user) may cause TCP to use a nominal MSS of as little as 88 (actual MSS of 76 with timestamps). Given that we have a sufficiently prodigious local sender and the peer ACKs quickly enough, it is nevertheless possible to grow the window for such a connection to the point that we will try to send just under 64K at once. This results in a single skb that expands to 861 segments. In some drivers with TSO support, such an skb will require hundreds of DMA descriptors; a substantial fraction of a TX ring or even more than a full ring. The TX queue selected for the skb may stall and trigger the TX watchdog repeatedly (since the problem skb will be retried after the TX reset). This particularly affects sfc, for which the issue is designated as CVE-2012-3412. Therefore: 1. Add the field net_device::gso_max_segs holding the device-specific limit. 2. In netif_skb_features(), if the number of segments is too high then mask out GSO features to force fall back to software GSO. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 0cb3fe8d8e72..f91abf800161 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2134,6 +2134,9 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) __be16 protocol = skb->protocol; netdev_features_t features = skb->dev->features; + if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) + features &= ~NETIF_F_GSO_MASK; + if (protocol == htons(ETH_P_8021Q)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; @@ -5986,6 +5989,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); dev->gso_max_size = GSO_MAX_SIZE; + dev->gso_max_segs = GSO_MAX_SEGS; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); -- cgit From 1485348d2424e1131ea42efc033cbd9366462b01 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 30 Jul 2012 16:11:42 +0000 Subject: tcp: Apply device TSO segment limit earlier Cache the device gso_max_segs in sock::sk_gso_max_segs and use it to limit the size of TSO skbs. This avoids the need to fall back to software GSO for local TCP senders. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/sock.c | 1 + net/ipv4/tcp.c | 4 +++- net/ipv4/tcp_cong.c | 3 ++- net/ipv4/tcp_output.c | 21 ++++++++++++--------- 4 files changed, 18 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 6b654b3ddfda..8f67ced8d6a8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1458,6 +1458,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = dst->dev->gso_max_size; + sk->sk_gso_max_segs = dst->dev->gso_max_segs; } } } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e7e6eeae49c0..2109ff4a1daf 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -811,7 +811,9 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, old_size_goal + mss_now > xmit_size_goal)) { xmit_size_goal = old_size_goal; } else { - tp->xmit_size_goal_segs = xmit_size_goal / mss_now; + tp->xmit_size_goal_segs = + min_t(u16, xmit_size_goal / mss_now, + sk->sk_gso_max_segs); xmit_size_goal = tp->xmit_size_goal_segs * mss_now; } } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 4d4db16e336e..1432cdb0644c 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -291,7 +291,8 @@ bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) left = tp->snd_cwnd - in_flight; if (sk_can_gso(sk) && left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && - left * tp->mss_cache < sk->sk_gso_max_size) + left * tp->mss_cache < sk->sk_gso_max_size && + left < sk->sk_gso_max_segs) return true; return left <= tcp_max_tso_deferred_mss(tp); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3f1bcff0b10b..a7b3ec9b6c3e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1522,21 +1522,21 @@ static void tcp_cwnd_validate(struct sock *sk) * when we would be allowed to send the split-due-to-Nagle skb fully. */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, - unsigned int mss_now, unsigned int cwnd) + unsigned int mss_now, unsigned int max_segs) { const struct tcp_sock *tp = tcp_sk(sk); - u32 needed, window, cwnd_len; + u32 needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; - cwnd_len = mss_now * cwnd; + max_len = mss_now * max_segs; - if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk))) - return cwnd_len; + if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) + return max_len; needed = min(skb->len, window); - if (cwnd_len <= needed) - return cwnd_len; + if (max_len <= needed) + return max_len; return needed - needed % mss_now; } @@ -1765,7 +1765,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ - if (limit >= sk->sk_gso_max_size) + if (limit >= min_t(unsigned int, sk->sk_gso_max_size, + sk->sk_gso_max_segs * tp->mss_cache)) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ @@ -1999,7 +2000,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, - cwnd_quota); + min_t(unsigned int, + cwnd_quota, + sk->sk_gso_max_segs)); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) -- cgit From e3c0d04750751389d5116267f8cf4687444d9a50 Mon Sep 17 00:00:00 2001 From: Fan Du Date: Mon, 30 Jul 2012 21:43:54 +0000 Subject: Fix unexpected SA hard expiration after changing date After SA is setup, one timer is armed to detect soft/hard expiration, however the timer handler uses xtime to do the math. This makes hard expiration occurs first before soft expiration after setting new date with big interval. As a result new child SA is deleted before rekeying the new one. Signed-off-by: Fan Du Signed-off-by: David S. Miller --- net/xfrm/xfrm_state.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 5b228f97d4b3..87cd0e4d4282 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -415,8 +415,17 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer * me) if (x->lft.hard_add_expires_seconds) { long tmo = x->lft.hard_add_expires_seconds + x->curlft.add_time - now; - if (tmo <= 0) - goto expired; + if (tmo <= 0) { + if (x->xflags & XFRM_SOFT_EXPIRE) { + /* enter hard expire without soft expire first?! + * setting a new date could trigger this. + * workarbound: fix x->curflt.add_time by below: + */ + x->curlft.add_time = now - x->saved_tmo - 1; + tmo = x->lft.hard_add_expires_seconds - x->saved_tmo; + } else + goto expired; + } if (tmo < next) next = tmo; } @@ -433,10 +442,14 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer * me) if (x->lft.soft_add_expires_seconds) { long tmo = x->lft.soft_add_expires_seconds + x->curlft.add_time - now; - if (tmo <= 0) + if (tmo <= 0) { warn = 1; - else if (tmo < next) + x->xflags &= ~XFRM_SOFT_EXPIRE; + } else if (tmo < next) { next = tmo; + x->xflags |= XFRM_SOFT_EXPIRE; + x->saved_tmo = tmo; + } } if (x->lft.soft_use_expires_seconds) { long tmo = x->lft.soft_use_expires_seconds + -- cgit From e33cdac014d50dd9753e1399ae8b0b5cd98d7aa0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Aug 2012 23:23:40 +0000 Subject: ipv4: route.c cleanup Remove unused includes after IP cache removal Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c035251beb07..e4ba974f143c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -70,7 +70,6 @@ #include #include #include -#include #include #include #include @@ -80,7 +79,6 @@ #include #include #include -#include #include #include #include @@ -88,11 +86,9 @@ #include #include #include -#include #include #include #include -#include #include #include #include -- cgit From 03f6b0843ad6512f27bc2e48f04c21065311e03e Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Wed, 1 Aug 2012 15:58:42 -0500 Subject: cfg80211: add channel flag to prohibit OFDM operation Currently the only way for wireless drivers to tell whether or not OFDM is allowed on the current channel is to check the regulatory information. However, this requires hodling cfg80211_mutex, which is not visible to the drivers. Other regulatory restrictions are provided as flags in the channel definition, so let's do similarly with OFDM. This patch adds a new flag, IEEE80211_CHAN_NO_OFDM, to tell drivers that OFDM on a channel is not allowed. This flag is set on any channels for which regulatory indicates that OFDM is prohibited. Signed-off-by: Seth Forshee Tested-by: Arend van Spriel Signed-off-by: Johannes Berg --- net/wireless/reg.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index a9175fedeb59..cbf30de79c69 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -680,6 +680,8 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_NO_IBSS; if (rd_flags & NL80211_RRF_DFS) channel_flags |= IEEE80211_CHAN_RADAR; + if (rd_flags & NL80211_RRF_NO_OFDM) + channel_flags |= IEEE80211_CHAN_NO_OFDM; return channel_flags; } -- cgit From 899852af60c212bfe9a2fb71d4d9082d2622df5c Mon Sep 17 00:00:00 2001 From: Paul Stewart Date: Wed, 1 Aug 2012 16:54:42 -0700 Subject: cfg80211: Clear "beacon_found" on regulatory restore Restore the default state to the "beacon_found" flag when the channel flags are restored. Otherwise, we can end up with a channel that we can no longer transmit on even when we can see beacons on that channel. Signed-off-by: Paul Stewart Signed-off-by: Johannes Berg --- net/wireless/reg.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index cbf30de79c69..2ded3c7fad06 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1901,6 +1901,7 @@ static void restore_custom_reg_settings(struct wiphy *wiphy) chan->flags = chan->orig_flags; chan->max_antenna_gain = chan->orig_mag; chan->max_power = chan->orig_mpwr; + chan->beacon_found = false; } } } -- cgit From 696ecdc10622d86541f2e35cc16e15b6b3b1b67e Mon Sep 17 00:00:00 2001 From: Hiroaki SHIMODA Date: Fri, 3 Aug 2012 19:57:52 +0900 Subject: net_sched: gact: Fix potential panic in tcf_gact(). gact_rand array is accessed by gact->tcfg_ptype whose value is assumed to less than MAX_RAND, but any range checks are not performed. So add a check in tcf_gact_init(). And in tcf_gact(), we can reduce a branch. Signed-off-by: Hiroaki SHIMODA Signed-off-by: David S. Miller --- net/sched/act_gact.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index f10fb8256442..05d60859d8e3 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -67,6 +67,9 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, struct tcf_common *pc; int ret = 0; int err; +#ifdef CONFIG_GACT_PROB + struct tc_gact_p *p_parm = NULL; +#endif if (nla == NULL) return -EINVAL; @@ -82,6 +85,12 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, #ifndef CONFIG_GACT_PROB if (tb[TCA_GACT_PROB] != NULL) return -EOPNOTSUPP; +#else + if (tb[TCA_GACT_PROB]) { + p_parm = nla_data(tb[TCA_GACT_PROB]); + if (p_parm->ptype >= MAX_RAND) + return -EINVAL; + } #endif pc = tcf_hash_check(parm->index, a, bind, &gact_hash_info); @@ -103,8 +112,7 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, spin_lock_bh(&gact->tcf_lock); gact->tcf_action = parm->action; #ifdef CONFIG_GACT_PROB - if (tb[TCA_GACT_PROB] != NULL) { - struct tc_gact_p *p_parm = nla_data(tb[TCA_GACT_PROB]); + if (p_parm) { gact->tcfg_paction = p_parm->paction; gact->tcfg_pval = p_parm->pval; gact->tcfg_ptype = p_parm->ptype; @@ -133,7 +141,7 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, spin_lock(&gact->tcf_lock); #ifdef CONFIG_GACT_PROB - if (gact->tcfg_ptype && gact_rand[gact->tcfg_ptype] != NULL) + if (gact->tcfg_ptype) action = gact_rand[gact->tcfg_ptype](gact); else action = gact->tcf_action; -- cgit From 269c4845d5b3627b95b1934107251bacbe99bb68 Mon Sep 17 00:00:00 2001 From: Gustavo Padovan Date: Fri, 15 Jun 2012 02:30:20 -0300 Subject: Bluetooth: Fix possible deadlock in SCO code sco_chan_del() only has conn != NULL when called from sco_conn_del() so just move the code from it that deal with conn to sco_conn_del(). [ 120.765529] [ 120.765529] ====================================================== [ 120.766529] [ INFO: possible circular locking dependency detected ] [ 120.766529] 3.5.0-rc1-10292-g3701f94-dirty #70 Tainted: G W [ 120.766529] ------------------------------------------------------- [ 120.766529] kworker/u:3/1497 is trying to acquire lock: [ 120.766529] (&(&conn->lock)->rlock#2){+.+...}, at: [] sco_chan_del+0x4c/0x170 [bluetooth] [ 120.766529] [ 120.766529] but task is already holding lock: [ 120.766529] (slock-AF_BLUETOOTH-BTPROTO_SCO){+.+...}, at: [] sco_conn_del+0x61/0xe0 [bluetooth] [ 120.766529] [ 120.766529] which lock already depends on the new lock. [ 120.766529] [ 120.766529] [ 120.766529] the existing dependency chain (in reverse order) is: [ 120.766529] [ 120.766529] -> #1 (slock-AF_BLUETOOTH-BTPROTO_SCO){+.+...}: [ 120.766529] [] lock_acquire+0x8e/0xb0 [ 120.766529] [] _raw_spin_lock+0x40/0x80 [ 120.766529] [] sco_connect_cfm+0x79/0x300 [bluetooth] [ 120.766529] [] hci_sync_conn_complete_evt.isra.90+0x343/0x400 [bluetooth] [ 120.766529] [] hci_event_packet+0x317/0xfb0 [bluetooth] [ 120.766529] [] hci_rx_work+0x2c8/0x890 [bluetooth] [ 120.766529] [] process_one_work+0x197/0x460 [ 120.766529] [] worker_thread+0x126/0x2d0 [ 120.766529] [] kthread+0x9d/0xb0 [ 120.766529] [] kernel_thread_helper+0x4/0x10 [ 120.766529] [ 120.766529] -> #0 (&(&conn->lock)->rlock#2){+.+...}: [ 120.766529] [] __lock_acquire+0x154a/0x1d30 [ 120.766529] [] lock_acquire+0x8e/0xb0 [ 120.766529] [] _raw_spin_lock+0x40/0x80 [ 120.766529] [] sco_chan_del+0x4c/0x170 [bluetooth] [ 120.766529] [] sco_conn_del+0x74/0xe0 [bluetooth] [ 120.766529] [] sco_disconn_cfm+0x32/0x60 [bluetooth] [ 120.766529] [] hci_disconn_complete_evt.isra.53+0x242/0x390 [bluetooth] [ 120.766529] [] hci_event_packet+0x617/0xfb0 [bluetooth] [ 120.766529] [] hci_rx_work+0x2c8/0x890 [bluetooth] [ 120.766529] [] process_one_work+0x197/0x460 [ 120.766529] [] worker_thread+0x126/0x2d0 [ 120.766529] [] kthread+0x9d/0xb0 [ 120.766529] [] kernel_thread_helper+0x4/0x10 [ 120.766529] [ 120.766529] other info that might help us debug this: [ 120.766529] [ 120.766529] Possible unsafe locking scenario: [ 120.766529] [ 120.766529] CPU0 CPU1 [ 120.766529] ---- ---- [ 120.766529] lock(slock-AF_BLUETOOTH-BTPROTO_SCO); [ 120.766529] lock(&(&conn->lock)->rlock#2); [ 120.766529] lock(slock-AF_BLUETOOTH-BTPROTO_SCO); [ 120.766529] lock(&(&conn->lock)->rlock#2); [ 120.766529] [ 120.766529] *** DEADLOCK *** Signed-off-by: Gustavo Padovan --- net/bluetooth/sco.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 40bbe25dcff7..3589e21edb09 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -131,6 +131,15 @@ static int sco_conn_del(struct hci_conn *hcon, int err) sco_sock_clear_timer(sk); sco_chan_del(sk, err); bh_unlock_sock(sk); + + sco_conn_lock(conn); + conn->sk = NULL; + sco_pi(sk)->conn = NULL; + sco_conn_unlock(conn); + + if (conn->hcon) + hci_conn_put(conn->hcon); + sco_sock_kill(sk); } @@ -821,16 +830,6 @@ static void sco_chan_del(struct sock *sk, int err) BT_DBG("sk %p, conn %p, err %d", sk, conn, err); - if (conn) { - sco_conn_lock(conn); - conn->sk = NULL; - sco_pi(sk)->conn = NULL; - sco_conn_unlock(conn); - - if (conn->hcon) - hci_conn_put(conn->hcon); - } - sk->sk_state = BT_CLOSED; sk->sk_err = err; sk->sk_state_change(sk); -- cgit From a9ea3ed9b71cc3271dd59e76f65748adcaa76422 Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Thu, 19 Jul 2012 14:46:08 +0200 Subject: Bluetooth: Fix legacy pairing with some devices Some devices e.g. some Android based phones don't do SDP search before pairing and cancel legacy pairing when ACL is disconnected. PIN Code Request event which changes ACL timeout to HCI_PAIRING_TIMEOUT is only received after remote user entered PIN. In that case no L2CAP is connected so default HCI_DISCONN_TIMEOUT (2 seconds) is being used to timeout ACL connection. This results in problems with legacy pairing as remote user has only few seconds to enter PIN before ACL is disconnected. Increase disconnect timeout for incomming connection to HCI_PAIRING_TIMEOUT if SSP is disabled and no linkey exists. To avoid keeping ACL alive for too long after SDP search set ACL timeout back to HCI_DISCONN_TIMEOUT when L2CAP is connected. 2012-07-19 13:24:43.413521 < HCI Command: Create Connection (0x01|0x0005) plen 13 bdaddr 00:02:72:D6:6A:3F ptype 0xcc18 rswitch 0x01 clkoffset 0x0000 Packet type: DM1 DM3 DM5 DH1 DH3 DH5 2012-07-19 13:24:43.425224 > HCI Event: Command Status (0x0f) plen 4 Create Connection (0x01|0x0005) status 0x00 ncmd 1 2012-07-19 13:24:43.885222 > HCI Event: Role Change (0x12) plen 8 status 0x00 bdaddr 00:02:72:D6:6A:3F role 0x01 Role: Slave 2012-07-19 13:24:44.054221 > HCI Event: Connect Complete (0x03) plen 11 status 0x00 handle 42 bdaddr 00:02:72:D6:6A:3F type ACL encrypt 0x00 2012-07-19 13:24:44.054313 < HCI Command: Read Remote Supported Features (0x01|0x001b) plen 2 handle 42 2012-07-19 13:24:44.055176 > HCI Event: Page Scan Repetition Mode Change (0x20) plen 7 bdaddr 00:02:72:D6:6A:3F mode 0 2012-07-19 13:24:44.056217 > HCI Event: Max Slots Change (0x1b) plen 3 handle 42 slots 5 2012-07-19 13:24:44.059218 > HCI Event: Command Status (0x0f) plen 4 Read Remote Supported Features (0x01|0x001b) status 0x00 ncmd 0 2012-07-19 13:24:44.062192 > HCI Event: Command Status (0x0f) plen 4 Unknown (0x00|0x0000) status 0x00 ncmd 1 2012-07-19 13:24:44.067219 > HCI Event: Read Remote Supported Features (0x0b) plen 11 status 0x00 handle 42 Features: 0xbf 0xfe 0xcf 0xfe 0xdb 0xff 0x7b 0x87 2012-07-19 13:24:44.067248 < HCI Command: Read Remote Extended Features (0x01|0x001c) plen 3 handle 42 page 1 2012-07-19 13:24:44.071217 > HCI Event: Command Status (0x0f) plen 4 Read Remote Extended Features (0x01|0x001c) status 0x00 ncmd 1 2012-07-19 13:24:44.076218 > HCI Event: Read Remote Extended Features (0x23) plen 13 status 0x00 handle 42 page 1 max 1 Features: 0x01 0x00 0x00 0x00 0x00 0x00 0x00 0x00 2012-07-19 13:24:44.076249 < HCI Command: Remote Name Request (0x01|0x0019) plen 10 bdaddr 00:02:72:D6:6A:3F mode 2 clkoffset 0x0000 2012-07-19 13:24:44.081218 > HCI Event: Command Status (0x0f) plen 4 Remote Name Request (0x01|0x0019) status 0x00 ncmd 1 2012-07-19 13:24:44.105214 > HCI Event: Remote Name Req Complete (0x07) plen 255 status 0x00 bdaddr 00:02:72:D6:6A:3F name 'uw000951-0' 2012-07-19 13:24:44.105284 < HCI Command: Authentication Requested (0x01|0x0011) plen 2 handle 42 2012-07-19 13:24:44.111207 > HCI Event: Command Status (0x0f) plen 4 Authentication Requested (0x01|0x0011) status 0x00 ncmd 1 2012-07-19 13:24:44.112220 > HCI Event: Link Key Request (0x17) plen 6 bdaddr 00:02:72:D6:6A:3F 2012-07-19 13:24:44.112249 < HCI Command: Link Key Request Negative Reply (0x01|0x000c) plen 6 bdaddr 00:02:72:D6:6A:3F 2012-07-19 13:24:44.115215 > HCI Event: Command Complete (0x0e) plen 10 Link Key Request Negative Reply (0x01|0x000c) ncmd 1 status 0x00 bdaddr 00:02:72:D6:6A:3F 2012-07-19 13:24:44.116215 > HCI Event: PIN Code Request (0x16) plen 6 bdaddr 00:02:72:D6:6A:3F 2012-07-19 13:24:48.099184 > HCI Event: Auth Complete (0x06) plen 3 status 0x13 handle 42 Error: Remote User Terminated Connection 2012-07-19 13:24:48.179182 > HCI Event: Disconn Complete (0x05) plen 4 status 0x00 handle 42 reason 0x13 Reason: Remote User Terminated Connection Cc: stable@vger.kernel.org Signed-off-by: Szymon Janc Acked-by: Johan Hedberg Signed-off-by: Gustavo Padovan --- net/bluetooth/hci_event.c | 7 ++++++- net/bluetooth/l2cap_core.c | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 41ff978a33f9..248632cec11d 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1762,7 +1762,12 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (conn->type == ACL_LINK) { conn->state = BT_CONFIG; hci_conn_hold(conn); - conn->disc_timeout = HCI_DISCONN_TIMEOUT; + + if (!conn->out && !hci_conn_ssp_enabled(conn) && + !hci_find_link_key(hdev, &ev->bdaddr)) + conn->disc_timeout = HCI_PAIRING_TIMEOUT; + else + conn->disc_timeout = HCI_DISCONN_TIMEOUT; } else conn->state = BT_CONNECTED; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index a8964db04bfb..daa149b7003c 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1181,6 +1181,7 @@ static void l2cap_le_conn_ready(struct l2cap_conn *conn) sk = chan->sk; hci_conn_hold(conn->hcon); + conn->hcon->disc_timeout = HCI_DISCONN_TIMEOUT; bacpy(&bt_sk(sk)->src, conn->src); bacpy(&bt_sk(sk)->dst, conn->dst); -- cgit From c810089c27e48b816181b454fcc493d19fdbc2ba Mon Sep 17 00:00:00 2001 From: Ram Malovany Date: Thu, 19 Jul 2012 10:26:09 +0300 Subject: Bluetooth: Fix using NULL inquiry entry If entry wasn't found in the hci_inquiry_cache_lookup_resolve do not resolve the name.This will fix a kernel crash when trying to use NULL pointer. Cc: stable@vger.kernel.org Signed-off-by: Ram Malovany Signed-off-by: Gustavo Padovan --- net/bluetooth/hci_event.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 248632cec11d..b64cfa213bd6 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1365,6 +1365,9 @@ static bool hci_resolve_next_name(struct hci_dev *hdev) return false; e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, NAME_NEEDED); + if (!e) + return false; + if (hci_resolve_name(hdev, e) == 0) { e->name_state = NAME_PENDING; return true; -- cgit From 7cc8380eb10347016d95bf6f9d842c2ae6d12932 Mon Sep 17 00:00:00 2001 From: Ram Malovany Date: Thu, 19 Jul 2012 10:26:10 +0300 Subject: Bluetooth: Fix using a NULL inquiry cache entry If the device was not found in a list of found devices names of which are pending.This may happen in a case when HCI Remote Name Request was sent as a part of incoming connection establishment procedure. Hence there is no need to continue resolving a next name as it will be done upon receiving another Remote Name Request Complete Event. This will fix a kernel crash when trying to use this entry to resolve the next name. Cc: stable@vger.kernel.org Signed-off-by: Ram Malovany Signed-off-by: Gustavo Padovan --- net/bluetooth/hci_event.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index b64cfa213bd6..fe9a3d6d30b0 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1396,12 +1396,18 @@ static void hci_check_pending_name(struct hci_dev *hdev, struct hci_conn *conn, return; e = hci_inquiry_cache_lookup_resolve(hdev, bdaddr, NAME_PENDING); - if (e) { + /* If the device was not found in a list of found devices names of which + * are pending. there is no need to continue resolving a next name as it + * will be done upon receiving another Remote Name Request Complete + * Event */ + if (!e) + return; + + list_del(&e->list); + if (name) { e->name_state = NAME_KNOWN; - list_del(&e->list); - if (name) - mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00, - e->data.rssi, name, name_len); + mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00, + e->data.rssi, name, name_len); } if (hci_resolve_next_name(hdev)) -- cgit From c3e7c0d90b14a3e7ac091d24cef09efb516d587b Mon Sep 17 00:00:00 2001 From: Ram Malovany Date: Thu, 19 Jul 2012 10:26:11 +0300 Subject: Bluetooth: Set name_state to unknown when entry name is empty When the name of the given entry is empty , the state needs to be updated accordingly. Cc: stable@vger.kernel.org Signed-off-by: Ram Malovany Signed-off-by: Gustavo Padovan --- net/bluetooth/hci_event.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index fe9a3d6d30b0..715d7e33fba0 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1408,6 +1408,8 @@ static void hci_check_pending_name(struct hci_dev *hdev, struct hci_conn *conn, e->name_state = NAME_KNOWN; mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00, e->data.rssi, name, name_len); + } else { + e->name_state = NAME_NOT_KNOWN; } if (hci_resolve_next_name(hdev)) -- cgit From d08fd0e712a834d4abb869c0215a702e290bc51e Mon Sep 17 00:00:00 2001 From: Andrei Emeltchenko Date: Thu, 19 Jul 2012 17:03:43 +0300 Subject: Bluetooth: smp: Fix possible NULL dereference smp_chan_create might return NULL so we need to check before dereferencing smp. Signed-off-by: Andrei Emeltchenko Signed-off-by: Gustavo Padovan --- net/bluetooth/smp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 16ef0dc85a0a..901a616c8083 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -579,8 +579,11 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) if (!test_and_set_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) smp = smp_chan_create(conn); + else + smp = conn->smp_chan; - smp = conn->smp_chan; + if (!smp) + return SMP_UNSPECIFIED; smp->preq[0] = SMP_CMD_PAIRING_REQ; memcpy(&smp->preq[1], req, sizeof(*req)); -- cgit From 49dfbb9129c4edb318578de35cc45c555df37884 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2012 12:54:04 +0530 Subject: Bluetooth: Fix socket not getting freed if l2cap channel create fails If l2cap_chan_create() fails then it will return from l2cap_sock_kill since zapped flag of sk is reset. Signed-off-by: Jaganath Kanakkassery Signed-off-by: Gustavo Padovan --- net/bluetooth/l2cap_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index a4bb27e8427e..b94abd30e6f9 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1174,7 +1174,7 @@ static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, int p chan = l2cap_chan_create(); if (!chan) { - l2cap_sock_kill(sk); + sk_free(sk); return NULL; } -- cgit From 1f6fc43e621167492ed4b7f3b4269c584c3d6ccc Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Thu, 2 Aug 2012 18:41:48 +0100 Subject: cfg80211: process pending events when unregistering net device libertas currently calls cfg80211_disconnected() when it is being brought down. This causes an event to be allocated, but since the wdev is already removed from the rdev by the time that the event processing work executes, the event is never processed or freed. http://article.gmane.org/gmane.linux.kernel.wireless.general/95666 Fix this leak, and other possible situations, by processing the event queue when a device is being unregistered. Thanks to Johannes Berg for the suggestion. Signed-off-by: Daniel Drake Cc: stable@vger.kernel.org Reviewed-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/core.c | 5 +++++ net/wireless/core.h | 1 + net/wireless/util.c | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 31b40cc4a9c3..dcd64d5b07aa 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -952,6 +952,11 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, */ synchronize_rcu(); INIT_LIST_HEAD(&wdev->list); + /* + * Ensure that all events have been processed and + * freed. + */ + cfg80211_process_wdev_events(wdev); break; case NETDEV_PRE_UP: if (!(wdev->wiphy->interface_modes & BIT(wdev->iftype))) diff --git a/net/wireless/core.h b/net/wireless/core.h index 5206c6844fd7..bc7430b54771 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -426,6 +426,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype ntype, u32 *flags, struct vif_params *params); void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev); +void cfg80211_process_wdev_events(struct wireless_dev *wdev); int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, diff --git a/net/wireless/util.c b/net/wireless/util.c index 26f8cd30f712..994e2f0cc7a8 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -735,7 +735,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) wdev->connect_keys = NULL; } -static void cfg80211_process_wdev_events(struct wireless_dev *wdev) +void cfg80211_process_wdev_events(struct wireless_dev *wdev) { struct cfg80211_event *ev; unsigned long flags; -- cgit From caa0bf648cd20a2efbb6558531711e9ce2c6e948 Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Sat, 4 Aug 2012 04:13:26 +0000 Subject: batman-adv: select an internet gateway if none was chosen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a regression introduced by: 2265c141086474bbae55a5bb3afa1ebb78ccaa7c ("batman-adv: gateway election code refactoring") Reported-by: Nicolás Echániz Signed-off-by: Marek Lindner Acked-by: Antonio Quartulli Signed-off-by: Antonio Quartulli Signed-off-by: David S. Miller --- net/batman-adv/gateway_client.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index b421cc49d2cd..fc866f2e4528 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -200,11 +200,11 @@ void batadv_gw_election(struct batadv_priv *bat_priv) if (atomic_read(&bat_priv->gw_mode) != BATADV_GW_MODE_CLIENT) goto out; - if (!batadv_atomic_dec_not_zero(&bat_priv->gw_reselect)) - goto out; - curr_gw = batadv_gw_get_selected_gw_node(bat_priv); + if (!batadv_atomic_dec_not_zero(&bat_priv->gw_reselect) && curr_gw) + goto out; + next_gw = batadv_gw_get_best_gw_node(bat_priv); if (curr_gw == next_gw) -- cgit From 8e7dfbc8d1ea9ca9058aa641a8fe795ebca320e2 Mon Sep 17 00:00:00 2001 From: Silviu-Mihai Popescu Date: Sat, 4 Aug 2012 09:31:29 +0000 Subject: tcp_output: fix sparse warning for tcp_wfree Fix sparse warning: * symbol 'tcp_wfree' was not declared. Should it be static? Signed-off-by: Silviu-Mihai Popescu Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a7b3ec9b6c3e..20dfd892c86f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -940,7 +940,7 @@ void __init tcp_tasklet_init(void) * We cant xmit new skbs from this context, as we might already * hold qdisc lock. */ -void tcp_wfree(struct sk_buff *skb) +static void tcp_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); -- cgit From 91d27a8650d5359a7a320daeb35b88cdea15e3a8 Mon Sep 17 00:00:00 2001 From: Sorin Dumitru Date: Mon, 6 Aug 2012 02:35:58 +0000 Subject: llc: free the right skb We are freeing skb instead of nskb, resulting in a double free on skb and a leak from nskb. Signed-off-by: Sorin Dumitru Signed-off-by: David S. Miller --- net/llc/llc_station.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c index 39a8d8924b9c..6828e39ec2ec 100644 --- a/net/llc/llc_station.c +++ b/net/llc/llc_station.c @@ -268,7 +268,7 @@ static int llc_station_ac_send_null_dsap_xid_c(struct sk_buff *skb) out: return rc; free: - kfree_skb(skb); + kfree_skb(nskb); goto out; } @@ -293,7 +293,7 @@ static int llc_station_ac_send_xid_r(struct sk_buff *skb) out: return rc; free: - kfree_skb(skb); + kfree_skb(nskb); goto out; } @@ -322,7 +322,7 @@ static int llc_station_ac_send_test_r(struct sk_buff *skb) out: return rc; free: - kfree_skb(skb); + kfree_skb(nskb); goto out; } -- cgit From 9871f1ad677d95ffeca80e2c21b70af9bfc9cc91 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Mon, 6 Aug 2012 03:55:29 +0000 Subject: ip: fix error handling in ip_finish_output2() __neigh_create() returns either a pointer to struct neighbour or PTR_ERR(). But the caller expects it to return either a pointer or NULL. Replace the NULL check with IS_ERR() check. The bug was introduced in a263b3093641fb1ec377582c90986a7fd0625184 ("ipv4: Make neigh lookups directly in output packet path."). Signed-off-by: Vasily Kulikov Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ba39a52d18c1..76dde25fb9a0 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -197,7 +197,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) neigh = __ipv4_neigh_lookup_noref(dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); - if (neigh) { + if (!IS_ERR(neigh)) { int res = dst_neigh_output(dst, neigh, skb); rcu_read_unlock_bh(); -- cgit From 47fd92f5a76eb3f5b407773766e7d7fa1a179419 Mon Sep 17 00:00:00 2001 From: Hiroaki SHIMODA Date: Mon, 6 Aug 2012 05:45:48 +0000 Subject: net_sched: act: Delete estimator in error path. Some action modules free struct tcf_common in their error path while estimator is still active. This results in est_timer() dereference freed memory. Add gen_kill_estimator() in ipt, pedit and simple action. Signed-off-by: Hiroaki SHIMODA Signed-off-by: David S. Miller --- net/sched/act_ipt.c | 7 ++++++- net/sched/act_pedit.c | 5 ++++- net/sched/act_simple.c | 5 ++++- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 60e281ad0f07..58fb3c7aab9e 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -185,7 +185,12 @@ err3: err2: kfree(tname); err1: - kfree(pc); + if (ret == ACT_P_CREATED) { + if (est) + gen_kill_estimator(&pc->tcfc_bstats, + &pc->tcfc_rate_est); + kfree_rcu(pc, tcfc_rcu); + } return err; } diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 26aa2f6ce257..45c53ab067a6 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -74,7 +74,10 @@ static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est, p = to_pedit(pc); keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) { - kfree(pc); + if (est) + gen_kill_estimator(&pc->tcfc_bstats, + &pc->tcfc_rate_est); + kfree_rcu(pc, tcfc_rcu); return -ENOMEM; } ret = ACT_P_CREATED; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 3922f2a2821b..3714f60f0b3c 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -131,7 +131,10 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est, d = to_defact(pc); ret = alloc_defdata(d, defdata); if (ret < 0) { - kfree(pc); + if (est) + gen_kill_estimator(&pc->tcfc_bstats, + &pc->tcfc_rate_est); + kfree_rcu(pc, tcfc_rcu); return ret; } d->tcf_action = parm->action; -- cgit From 5d299f3d3c8a2fbc732b1bf03af36333ccec3130 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Aug 2012 05:09:33 +0000 Subject: net: ipv6: fix TCP early demux IPv6 needs a cookie in dst_check() call. We need to add rx_dst_cookie and provide a family independent sk_rx_dst_set(sk, skb) method to properly support IPv6 TCP early demux. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 +++- net/ipv4/tcp_ipv4.c | 13 ++++++++++--- net/ipv4/tcp_minisocks.c | 2 +- net/ipv6/tcp_ipv6.c | 27 +++++++++++++++++++++++++-- 4 files changed, 39 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2fd2bc9e3c64..85308b90df80 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5392,6 +5392,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); + if (unlikely(sk->sk_rx_dst == NULL)) + inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* * Header prediction. * The code loosely follows the one in the famous @@ -5605,7 +5607,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_ESTABLISHED); if (skb != NULL) { - inet_sk_rx_dst_set(sk, skb); + icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 42b2a6a73092..272241f16fcb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1627,9 +1627,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk->sk_rx_dst = NULL; } } - if (unlikely(sk->sk_rx_dst == NULL)) - inet_sk_rx_dst_set(sk, skb); - if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; @@ -1872,10 +1869,20 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_destructor= tcp_twsk_destructor, }; +static void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; +} + const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, + .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, .net_header_len = sizeof(struct iphdr), diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 232a90c3ec86..d9c9dcef2de3 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -387,7 +387,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_sock *oldtp = tcp_sk(sk); struct tcp_cookie_values *oldcvp = oldtp->cookie_values; - inet_sk_rx_dst_set(newsk, skb); + newicsk->icsk_af_ops->sk_rx_dst_set(newsk, skb); /* TCP Cookie Transactions require space for the cookie pair, * as it differs for each connection. There is no need to diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c66b90f71c9b..5a439e9a4c01 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1447,7 +1447,17 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) opt_skb = skb_clone(skb, sk_gfp_atomic(sk, GFP_ATOMIC)); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + struct dst_entry *dst = sk->sk_rx_dst; + sock_rps_save_rxhash(sk, skb); + if (dst) { + if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || + dst->ops->check(dst, np->rx_dst_cookie) == NULL) { + dst_release(dst); + sk->sk_rx_dst = NULL; + } + } + if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) goto reset; if (opt_skb) @@ -1705,9 +1715,9 @@ static void tcp_v6_early_demux(struct sk_buff *skb) struct dst_entry *dst = sk->sk_rx_dst; struct inet_sock *icsk = inet_sk(sk); if (dst) - dst = dst_check(dst, 0); + dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie); if (dst && - icsk->rx_dst_ifindex == inet6_iif(skb)) + icsk->rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); } } @@ -1719,10 +1729,23 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_destructor= tcp_twsk_destructor, }; +static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct rt6_info *rt = (const struct rt6_info *)dst; + + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + if (rt->rt6i_node) + inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; +} + static const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, + .sk_rx_dst_set = inet6_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct ipv6hdr), -- cgit From 99aa3473e672ca610905838997fa018b95cd643f Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 6 Aug 2012 16:27:10 +0000 Subject: af_packet: Quiet sparse noise about using plain integer as NULL pointer Quiets the sparse warning: warning: Using plain integer as NULL pointer Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/packet/af_packet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ceaca7c134a0..f016f6634a79 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1079,7 +1079,7 @@ static void *packet_current_rx_frame(struct packet_sock *po, default: WARN(1, "TPACKET version not supported\n"); BUG(); - return 0; + return NULL; } } -- cgit From 0c03eca3d995e73d691edea8c787e25929ec156d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Aug 2012 00:47:11 +0000 Subject: net: fib: fix incorrect call_rcu_bh() After IP route cache removal, I believe rcu_bh() has very little use and we should remove this RCU variant, since it adds some cycles in fast path. Anyway, the call_rcu_bh() use in fib_true is obviously wrong, since some users only assert rcu_read_lock(). Signed-off-by: Eric Dumazet Cc: "Paul E. McKenney" Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index f0cdb30921c0..57bd978483e1 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -367,7 +367,7 @@ static void __leaf_free_rcu(struct rcu_head *head) static inline void free_leaf(struct leaf *l) { - call_rcu_bh(&l->rcu, __leaf_free_rcu); + call_rcu(&l->rcu, __leaf_free_rcu); } static inline void free_leaf_info(struct leaf_info *leaf) -- cgit From a37e6e344910a43b9ebc2bbf29a029f5ea942598 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Aug 2012 10:55:45 +0000 Subject: net: force dst_default_metrics to const section While investigating on network performance problems, I found this little gem : $ nm -v vmlinux | grep -1 dst_default_metrics ffffffff82736540 b busy.46605 ffffffff82736560 B dst_default_metrics ffffffff82736598 b dst_busy_list Apparently, declaring a const array without initializer put it in (writeable) bss section, in middle of possibly often dirtied cache lines. Since we really want dst_default_metrics be const to avoid any possible false sharing and catch any buggy writes, I force a null initializer. ffffffff818a4c20 R dst_default_metrics Signed-off-by: Eric Dumazet Cc: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dst.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dst.c b/net/core/dst.c index 069d51d29414..56d63612e1e4 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -149,7 +149,15 @@ int dst_discard(struct sk_buff *skb) } EXPORT_SYMBOL(dst_discard); -const u32 dst_default_metrics[RTAX_MAX]; +const u32 dst_default_metrics[RTAX_MAX + 1] = { + /* This initializer is needed to force linker to place this variable + * into const section. Otherwise it might end into bss section. + * We really want to avoid false sharing on this variable, and catch + * any writes on it. + */ + [RTAX_MAX] = 0xdeadbeef, +}; + void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags) -- cgit From be72f63b4cd5d6778cf7eb0d0a86f18f96b51c0c Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 7 Aug 2012 07:27:25 +0000 Subject: sched: add missing group change to qfq_change_class [Resending again, as the text was corrupted by the email client] To speed up operations, QFQ internally divides classes into groups. Which group a class belongs to depends on the ratio between the maximum packet length and the weight of the class. Unfortunately the function qfq_change_class lacks the steps for changing the group of a class when the ratio max_pkt_len/weight of the class changes. For example, when the last of the following three commands is executed, the group of class 1:1 is not correctly changed: tc disc add dev XXX root handle 1: qfq tc class add dev XXX parent 1: qfq classid 1:1 weight 1 tc class change dev XXX parent 1: classid 1:1 qfq weight 4 Not changing the group of a class does not affect the long-term bandwidth guaranteed to the class, as the latter is independent of the maximum packet length, and correctly changes (only) if the weight of the class changes. In contrast, if the group of the class is not updated, the class is still guaranteed the short-term bandwidth and packet delay related to its old group, instead of the guarantees that it should receive according to its new weight and/or maximum packet length. This may also break service guarantees for other classes. This patch adds the missing operations. Signed-off-by: Paolo Valente Signed-off-by: David S. Miller --- net/sched/sch_qfq.c | 95 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 69 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 9af01f3df18c..e4723d31fdd5 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -203,6 +203,34 @@ out: return index; } +/* Length of the next packet (0 if the queue is empty). */ +static unsigned int qdisc_peek_len(struct Qdisc *sch) +{ + struct sk_buff *skb; + + skb = sch->ops->peek(sch); + return skb ? qdisc_pkt_len(skb) : 0; +} + +static void qfq_deactivate_class(struct qfq_sched *, struct qfq_class *); +static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, + unsigned int len); + +static void qfq_update_class_params(struct qfq_sched *q, struct qfq_class *cl, + u32 lmax, u32 inv_w, int delta_w) +{ + int i; + + /* update qfq-specific data */ + cl->lmax = lmax; + cl->inv_w = inv_w; + i = qfq_calc_index(cl->inv_w, cl->lmax); + + cl->grp = &q->groups[i]; + + q->wsum += delta_w; +} + static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg) { @@ -250,6 +278,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, lmax = 1UL << QFQ_MTU_SHIFT; if (cl != NULL) { + bool need_reactivation = false; + if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, &cl->rate_est, qdisc_root_sleeping_lock(sch), @@ -258,12 +288,29 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return err; } - if (inv_w != cl->inv_w) { - sch_tree_lock(sch); - q->wsum += delta_w; - cl->inv_w = inv_w; - sch_tree_unlock(sch); + if (lmax == cl->lmax && inv_w == cl->inv_w) + return 0; /* nothing to update */ + + i = qfq_calc_index(inv_w, lmax); + sch_tree_lock(sch); + if (&q->groups[i] != cl->grp && cl->qdisc->q.qlen > 0) { + /* + * shift cl->F back, to not charge the + * class for the not-yet-served head + * packet + */ + cl->F = cl->S; + /* remove class from its slot in the old group */ + qfq_deactivate_class(q, cl); + need_reactivation = true; } + + qfq_update_class_params(q, cl, lmax, inv_w, delta_w); + + if (need_reactivation) /* activate in new group */ + qfq_activate_class(q, cl, qdisc_peek_len(cl->qdisc)); + sch_tree_unlock(sch); + return 0; } @@ -273,11 +320,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->refcnt = 1; cl->common.classid = classid; - cl->lmax = lmax; - cl->inv_w = inv_w; - i = qfq_calc_index(cl->inv_w, cl->lmax); - cl->grp = &q->groups[i]; + qfq_update_class_params(q, cl, lmax, inv_w, delta_w); cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); @@ -294,7 +338,6 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return err; } } - q->wsum += weight; sch_tree_lock(sch); qdisc_class_hash_insert(&q->clhash, &cl->common); @@ -711,15 +754,6 @@ static void qfq_update_eligible(struct qfq_sched *q, u64 old_V) } } -/* What is length of next packet in queue (0 if queue is empty) */ -static unsigned int qdisc_peek_len(struct Qdisc *sch) -{ - struct sk_buff *skb; - - skb = sch->ops->peek(sch); - return skb ? qdisc_pkt_len(skb) : 0; -} - /* * Updates the class, returns true if also the group needs to be updated. */ @@ -843,11 +877,8 @@ static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); - struct qfq_group *grp; struct qfq_class *cl; int err; - u64 roundedS; - int s; cl = qfq_classify(skb, sch, &err); if (cl == NULL) { @@ -876,11 +907,25 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) return err; /* If reach this point, queue q was idle */ - grp = cl->grp; + qfq_activate_class(q, cl, qdisc_pkt_len(skb)); + + return err; +} + +/* + * Handle class switch from idle to backlogged. + */ +static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, + unsigned int pkt_len) +{ + struct qfq_group *grp = cl->grp; + u64 roundedS; + int s; + qfq_update_start(q, cl); /* compute new finish time and rounded start. */ - cl->F = cl->S + (u64)qdisc_pkt_len(skb) * cl->inv_w; + cl->F = cl->S + (u64)pkt_len * cl->inv_w; roundedS = qfq_round_down(cl->S, grp->slot_shift); /* @@ -917,8 +962,6 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) skip_update: qfq_slot_insert(grp, cl, roundedS); - - return err; } -- cgit From 155e4e12b9f49c2dc817bb4c44e9416c46833c3d Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Tue, 7 Aug 2012 08:32:34 +0000 Subject: batman-adv: Fix mem leak in the batadv_tt_local_event() function Memory is allocated for 'tt_change_node' with kmalloc(). 'tt_change_node' may go out of scope really being used for anything (except have a few members initialized) if we hit the 'del:' label. This patch makes sure we free the memory in that case. Signed-off-by: Jesper Juhl Acked-by: Antonio Quartulli Signed-off-by: David S. Miller --- net/batman-adv/translation-table.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index a438f4b582fc..99dd8f75b3ff 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -197,6 +197,7 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv, del: list_del(&entry->list); kfree(entry); + kfree(tt_change_node); event_removed = true; goto unlock; } -- cgit From 7364e445f62825758fa61195d237a5b8ecdd06ec Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Wed, 8 Aug 2012 00:33:25 +0000 Subject: net/core: Fix potential memory leak in dev_set_alias() Do not leak memory by updating pointer with potentially NULL realloc return value. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Signed-off-by: David S. Miller --- net/core/dev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index f91abf800161..a39354ee1432 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1055,6 +1055,8 @@ rollback: */ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) { + char *new_ifalias; + ASSERT_RTNL(); if (len >= IFALIASZ) @@ -1068,9 +1070,10 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) return 0; } - dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); - if (!dev->ifalias) + new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); + if (!new_ifalias) return -ENOMEM; + dev->ifalias = new_ifalias; strlcpy(dev->ifalias, alias, len+1); return len; -- cgit From 36471012e2ae28ca3178f84d4687a2d88a36593e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Aug 2012 11:19:13 +0200 Subject: tcp: must free metrics at net dismantle We currently leak all tcp metrics at struct net dismantle time. tcp_net_metrics_exit() frees the hash table, we must first iterate it to free all metrics. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 2288a6399e1e..0abe67bb4d3a 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -731,6 +731,18 @@ static int __net_init tcp_net_metrics_init(struct net *net) static void __net_exit tcp_net_metrics_exit(struct net *net) { + unsigned int i; + + for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) { + struct tcp_metrics_block *tm, *next; + + tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1); + while (tm) { + next = rcu_dereference_protected(tm->tcpm_next, 1); + kfree(tm); + tm = next; + } + } kfree(net->ipv4.tcp_metrics_hash); } -- cgit From 3a7c384ffd57ef5fbd95f48edaa2ca4eb3d9f2ee Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Aug 2012 13:56:06 +0000 Subject: ipv4: tcp: unicast_sock should not land outside of TCP stack commit be9f4a44e7d41cee (ipv4: tcp: remove per net tcp_sock) added a selinux regression, reported and bisected by John Stultz selinux_ip_postroute_compat() expect to find a valid sk->sk_security pointer, but this field is NULL for unicast_sock It turns out that unicast_sock are really temporary stuff to be able to reuse part of IP stack (ip_append_data()/ip_push_pending_frames()) Fact is that frames sent by ip_send_unicast_reply() should be orphaned to not fool LSM. Note IPv6 never had this problem, as tcp_v6_send_response() doesnt use a fake socket at all. I'll probably implement tcp_v4_send_response() to remove these unicast_sock in linux-3.7 Reported-by: John Stultz Bisected-by: John Stultz Signed-off-by: Eric Dumazet Cc: Paul Moore Cc: Eric Paris Cc: "Serge E. Hallyn" Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 76dde25fb9a0..ec410e08b4b9 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1536,6 +1536,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; + skb_orphan(nskb); skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } -- cgit From 63d02d157ec4124990258d66517b6c11fd6df0cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Aug 2012 14:11:00 +0000 Subject: net: tcp: ipv6_mapped needs sk_rx_dst_set method commit 5d299f3d3c8a2fb (net: ipv6: fix TCP early demux) added a regression for ipv6_mapped case. [ 67.422369] SELinux: initialized (dev autofs, type autofs), uses genfs_contexts [ 67.449678] SELinux: initialized (dev autofs, type autofs), uses genfs_contexts [ 92.631060] BUG: unable to handle kernel NULL pointer dereference at (null) [ 92.631435] IP: [< (null)>] (null) [ 92.631645] PGD 0 [ 92.631846] Oops: 0010 [#1] SMP [ 92.632095] Modules linked in: autofs4 sunrpc ipv6 dm_mirror dm_region_hash dm_log dm_multipath dm_mod video sbs sbshc battery ac lp parport sg snd_hda_intel snd_hda_codec snd_seq_oss snd_seq_midi_event snd_seq snd_seq_device pcspkr snd_pcm_oss snd_mixer_oss snd_pcm snd_timer serio_raw button floppy snd i2c_i801 i2c_core soundcore snd_page_alloc shpchp ide_cd_mod cdrom microcode ehci_hcd ohci_hcd uhci_hcd [ 92.634294] CPU 0 [ 92.634294] Pid: 4469, comm: sendmail Not tainted 3.6.0-rc1 #3 [ 92.634294] RIP: 0010:[<0000000000000000>] [< (null)>] (null) [ 92.634294] RSP: 0018:ffff880245fc7cb0 EFLAGS: 00010282 [ 92.634294] RAX: ffffffffa01985f0 RBX: ffff88024827ad00 RCX: 0000000000000000 [ 92.634294] RDX: 0000000000000218 RSI: ffff880254735380 RDI: ffff88024827ad00 [ 92.634294] RBP: ffff880245fc7cc8 R08: 0000000000000001 R09: 0000000000000000 [ 92.634294] R10: 0000000000000000 R11: ffff880245fc7bf8 R12: ffff880254735380 [ 92.634294] R13: ffff880254735380 R14: 0000000000000000 R15: 7fffffffffff0218 [ 92.634294] FS: 00007f4516ccd6f0(0000) GS:ffff880256600000(0000) knlGS:0000000000000000 [ 92.634294] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 92.634294] CR2: 0000000000000000 CR3: 0000000245ed1000 CR4: 00000000000007f0 [ 92.634294] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 92.634294] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 92.634294] Process sendmail (pid: 4469, threadinfo ffff880245fc6000, task ffff880254b8cac0) [ 92.634294] Stack: [ 92.634294] ffffffff813837a7 ffff88024827ad00 ffff880254b6b0e8 ffff880245fc7d68 [ 92.634294] ffffffff81385083 00000000001d2680 ffff8802547353a8 ffff880245fc7d18 [ 92.634294] ffffffff8105903a ffff88024827ad60 0000000000000002 00000000000000ff [ 92.634294] Call Trace: [ 92.634294] [] ? tcp_finish_connect+0x2c/0xfa [ 92.634294] [] tcp_rcv_state_process+0x2b6/0x9c6 [ 92.634294] [] ? sched_clock_cpu+0xc3/0xd1 [ 92.634294] [] ? local_clock+0x2b/0x3c [ 92.634294] [] tcp_v4_do_rcv+0x63a/0x670 [ 92.634294] [] release_sock+0x128/0x1bd [ 92.634294] [] __inet_stream_connect+0x1b1/0x352 [ 92.634294] [] ? lock_sock_nested+0x74/0x7f [ 92.634294] [] ? wake_up_bit+0x25/0x25 [ 92.634294] [] ? lock_sock_nested+0x74/0x7f [ 92.634294] [] ? inet_stream_connect+0x22/0x4b [ 92.634294] [] inet_stream_connect+0x33/0x4b [ 92.634294] [] sys_connect+0x78/0x9e [ 92.634294] [] ? sysret_check+0x1b/0x56 [ 92.634294] [] ? __audit_syscall_entry+0x195/0x1c8 [ 92.634294] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [ 92.634294] [] system_call_fastpath+0x16/0x1b [ 92.634294] Code: Bad RIP value. [ 92.634294] RIP [< (null)>] (null) [ 92.634294] RSP [ 92.634294] CR2: 0000000000000000 [ 92.648982] ---[ end trace 24e2bed94314c8d9 ]--- [ 92.649146] Kernel panic - not syncing: Fatal exception in interrupt Fix this using inet_sk_rx_dst_set(), and export this function in case IPv6 is modular. Reported-by: Andrew Morton Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 272241f16fcb..767823764016 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1869,7 +1869,7 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_destructor= tcp_twsk_destructor, }; -static void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -1877,6 +1877,7 @@ static void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; } +EXPORT_SYMBOL(inet_sk_rx_dst_set); const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5a439e9a4c01..bb9ce2b2f377 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1777,6 +1777,7 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, + .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct iphdr), -- cgit From e9324b2ce656e1910d2385b9b47a2f926456dbe3 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 9 Aug 2012 10:08:45 +0000 Subject: netfilter: nf_ct_sip: fix helper name Commit 3a8fc53a (netfilter: nf_ct_helper: allocate 16 bytes for the helper and policy names) introduced a bug in the SIP helper, the helper name is sprinted to the sip_names array instead of instead of into the helper structure. This breaks the helper match and the /proc/net/nf_conntrack_expect output. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_sip.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 758a1bacc126..2fb666920cc9 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1515,7 +1515,6 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff, } static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly; -static char sip_names[MAX_PORTS][4][sizeof("sip-65535")] __read_mostly; static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = { [SIP_EXPECT_SIGNALLING] = { @@ -1585,9 +1584,9 @@ static int __init nf_conntrack_sip_init(void) sip[i][j].me = THIS_MODULE; if (ports[i] == SIP_PORT) - sprintf(sip_names[i][j], "sip"); + sprintf(sip[i][j].name, "sip"); else - sprintf(sip_names[i][j], "sip-%u", i); + sprintf(sip[i][j].name, "sip-%u", i); pr_debug("port #%u: %u\n", i, ports[i]); -- cgit From 02b69cbdc2fb2e1bfbfd9ac0c246d7be1b08d3cd Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 9 Aug 2012 10:08:46 +0000 Subject: netfilter: nf_ct_sip: fix IPv6 address parsing Within SIP messages IPv6 addresses are enclosed in square brackets in most cases, with the exception of the "received=" header parameter. Currently the helper fails to parse enclosed addresses. This patch: - changes the SIP address parsing function to enforce square brackets when required, and accept them when not required but present, as recommended by RFC 5118. - adds a new SDP address parsing function that never accepts square brackets since SDP doesn't use them. With these changes, the SIP helper correctly parses all test messages from RFC 5118 (Session Initiation Protocol (SIP) Torture Test Messages for Internet Protocol Version 6 (IPv6)). Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_nat_sip.c | 4 +- net/netfilter/nf_conntrack_sip.c | 87 ++++++++++++++++++++++++++++++++-------- 2 files changed, 72 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index ea4a23813d26..eef8f29e8bf8 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -173,7 +173,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, * the reply. */ if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, "maddr=", &poff, &plen, - &addr) > 0 && + &addr, true) > 0 && addr.ip == ct->tuplehash[dir].tuple.src.u3.ip && addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) { buflen = sprintf(buffer, "%pI4", @@ -187,7 +187,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, * from which the server received the request. */ if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, "received=", &poff, &plen, - &addr) > 0 && + &addr, false) > 0 && addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip && addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { buflen = sprintf(buffer, "%pI4", diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 2fb666920cc9..5c0a112aeee6 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -183,12 +183,12 @@ static int media_len(const struct nf_conn *ct, const char *dptr, return len + digits_len(ct, dptr, limit, shift); } -static int parse_addr(const struct nf_conn *ct, const char *cp, - const char **endp, union nf_inet_addr *addr, - const char *limit) +static int sip_parse_addr(const struct nf_conn *ct, const char *cp, + const char **endp, union nf_inet_addr *addr, + const char *limit, bool delim) { const char *end; - int ret = 0; + int ret; if (!ct) return 0; @@ -197,16 +197,28 @@ static int parse_addr(const struct nf_conn *ct, const char *cp, switch (nf_ct_l3num(ct)) { case AF_INET: ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); + if (ret == 0) + return 0; break; case AF_INET6: + if (cp < limit && *cp == '[') + cp++; + else if (delim) + return 0; + ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); + if (ret == 0) + return 0; + + if (end < limit && *end == ']') + end++; + else if (delim) + return 0; break; default: BUG(); } - if (ret == 0 || end == cp) - return 0; if (endp) *endp = end; return 1; @@ -219,7 +231,7 @@ static int epaddr_len(const struct nf_conn *ct, const char *dptr, union nf_inet_addr addr; const char *aux = dptr; - if (!parse_addr(ct, dptr, &dptr, &addr, limit)) { + if (!sip_parse_addr(ct, dptr, &dptr, &addr, limit, true)) { pr_debug("ip: %s parse failed.!\n", dptr); return 0; } @@ -296,7 +308,7 @@ int ct_sip_parse_request(const struct nf_conn *ct, return 0; dptr += shift; - if (!parse_addr(ct, dptr, &end, addr, limit)) + if (!sip_parse_addr(ct, dptr, &end, addr, limit, true)) return -1; if (end < limit && *end == ':') { end++; @@ -550,7 +562,7 @@ int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr, if (ret == 0) return ret; - if (!parse_addr(ct, dptr + *matchoff, &c, addr, limit)) + if (!sip_parse_addr(ct, dptr + *matchoff, &c, addr, limit, true)) return -1; if (*c == ':') { c++; @@ -599,7 +611,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr, unsigned int dataoff, unsigned int datalen, const char *name, unsigned int *matchoff, unsigned int *matchlen, - union nf_inet_addr *addr) + union nf_inet_addr *addr, bool delim) { const char *limit = dptr + datalen; const char *start, *end; @@ -613,7 +625,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr, return 0; start += strlen(name); - if (!parse_addr(ct, start, &end, addr, limit)) + if (!sip_parse_addr(ct, start, &end, addr, limit, delim)) return 0; *matchoff = start - dptr; *matchlen = end - start; @@ -675,6 +687,47 @@ static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr, return 1; } +static int sdp_parse_addr(const struct nf_conn *ct, const char *cp, + const char **endp, union nf_inet_addr *addr, + const char *limit) +{ + const char *end; + int ret; + + memset(addr, 0, sizeof(*addr)); + switch (nf_ct_l3num(ct)) { + case AF_INET: + ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); + break; + case AF_INET6: + ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); + break; + default: + BUG(); + } + + if (ret == 0) + return 0; + if (endp) + *endp = end; + return 1; +} + +/* skip ip address. returns its length. */ +static int sdp_addr_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + union nf_inet_addr addr; + const char *aux = dptr; + + if (!sdp_parse_addr(ct, dptr, &dptr, &addr, limit)) { + pr_debug("ip: %s parse failed.!\n", dptr); + return 0; + } + + return dptr - aux; +} + /* SDP header parsing: a SDP session description contains an ordered set of * headers, starting with a section containing general session parameters, * optionally followed by multiple media descriptions. @@ -686,10 +739,10 @@ static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr, */ static const struct sip_header ct_sdp_hdrs[] = { [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len), - [SDP_HDR_OWNER_IP4] = SDP_HDR("o=", "IN IP4 ", epaddr_len), - [SDP_HDR_CONNECTION_IP4] = SDP_HDR("c=", "IN IP4 ", epaddr_len), - [SDP_HDR_OWNER_IP6] = SDP_HDR("o=", "IN IP6 ", epaddr_len), - [SDP_HDR_CONNECTION_IP6] = SDP_HDR("c=", "IN IP6 ", epaddr_len), + [SDP_HDR_OWNER_IP4] = SDP_HDR("o=", "IN IP4 ", sdp_addr_len), + [SDP_HDR_CONNECTION_IP4] = SDP_HDR("c=", "IN IP4 ", sdp_addr_len), + [SDP_HDR_OWNER_IP6] = SDP_HDR("o=", "IN IP6 ", sdp_addr_len), + [SDP_HDR_CONNECTION_IP6] = SDP_HDR("c=", "IN IP6 ", sdp_addr_len), [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len), }; @@ -775,8 +828,8 @@ static int ct_sip_parse_sdp_addr(const struct nf_conn *ct, const char *dptr, if (ret <= 0) return ret; - if (!parse_addr(ct, dptr + *matchoff, NULL, addr, - dptr + *matchoff + *matchlen)) + if (!sdp_parse_addr(ct, dptr + *matchoff, NULL, addr, + dptr + *matchoff + *matchlen)) return -1; return 1; } -- cgit From f22eb25cf5b1157b29ef88c793b71972efc47143 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 9 Aug 2012 10:08:47 +0000 Subject: netfilter: nf_nat_sip: fix via header translation with multiple parameters Via-headers are parsed beginning at the first character after the Via-address. When the address is translated first and its length decreases, the offset to start parsing at is incorrect and header parameters might be missed. Update the offset after translating the Via-address to fix this. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_nat_sip.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index eef8f29e8bf8..4ad9cf173992 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -148,7 +148,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, hdr, NULL, &matchoff, &matchlen, &addr, &port) > 0) { - unsigned int matchend, poff, plen, buflen, n; + unsigned int olen, matchend, poff, plen, buflen, n; char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; /* We're only interested in headers related to this @@ -163,11 +163,12 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, goto next; } + olen = *datalen; if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, &addr, port)) return NF_DROP; - matchend = matchoff + matchlen; + matchend = matchoff + matchlen + *datalen - olen; /* The maddr= parameter (RFC 2361) specifies where to send * the reply. */ -- cgit From b5ec8eeac46a99004c26791f70b15d001e970acf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 10 Aug 2012 02:22:47 +0000 Subject: ipv4: fix ip_send_skb() ip_send_skb() can send orphaned skb, so we must pass the net pointer to avoid possible NULL dereference in error path. Bug added by commit 3a7c384ffd57 (ipv4: tcp: unicast_sock should not land outside of TCP stack) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 5 ++--- net/ipv4/udp.c | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ec410e08b4b9..147ccc3e93db 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1366,9 +1366,8 @@ out: return skb; } -int ip_send_skb(struct sk_buff *skb) +int ip_send_skb(struct net *net, struct sk_buff *skb) { - struct net *net = sock_net(skb->sk); int err; err = ip_local_out(skb); @@ -1391,7 +1390,7 @@ int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) return 0; /* Netfilter gets whole the not fragmented skb. */ - return ip_send_skb(skb); + return ip_send_skb(sock_net(sk), skb); } /* diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b4c3582a991f..6f6d1aca3c3d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -758,7 +758,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) uh->check = CSUM_MANGLED_0; send: - err = ip_send_skb(skb); + err = ip_send_skb(sock_net(sk), skb); if (err) { if (err == -ENOBUFS && !inet->recverr) { UDP_INC_STATS_USER(sock_net(sk), -- cgit From 7f5c3e3a80e6654cf48dfba7cf94f88c6b505467 Mon Sep 17 00:00:00 2001 From: "danborkmann@iogearbox.net" Date: Fri, 10 Aug 2012 22:48:54 +0000 Subject: af_packet: remove BUG statement in tpacket_destruct_skb Here's a quote of the comment about the BUG macro from asm-generic/bug.h: Don't use BUG() or BUG_ON() unless there's really no way out; one example might be detecting data structure corruption in the middle of an operation that can't be backed out of. If the (sub)system can somehow continue operating, perhaps with reduced functionality, it's probably not BUG-worthy. If you're tempted to BUG(), think again: is completely giving up really the *only* solution? There are usually better options, where users don't need to reboot ASAP and can mostly shut down cleanly. In our case, the status flag of a ring buffer slot is managed from both sides, the kernel space and the user space. This means that even though the kernel side might work as expected, the user space screws up and changes this flag right between the send(2) is triggered when the flag is changed to TP_STATUS_SENDING and a given skb is destructed after some time. Then, this will hit the BUG macro. As David suggested, the best solution is to simply remove this statement since it cannot be used for kernel side internal consistency checks. I've tested it and the system still behaves /stable/ in this case, so in accordance with the above comment, we should rather remove it. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/packet/af_packet.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index f016f6634a79..8ac890a1a4c0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1936,7 +1936,6 @@ static void tpacket_destruct_skb(struct sk_buff *skb) if (likely(po->tx_ring.pg_vec)) { ph = skb_shinfo(skb)->destructor_arg; - BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING); BUG_ON(atomic_read(&po->tx_ring.pending) == 0); atomic_dec(&po->tx_ring.pending); __packet_set_status(po, ph, TP_STATUS_AVAILABLE); -- cgit From 68e035c950dbceaf660144bf74054dfdfb6aad15 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 14 Aug 2012 12:47:37 +0200 Subject: netfilter: ctnetlink: fix missing locking while changing conntrack from nfqueue Since 9cb017665 netfilter: add glue code to integrate nfnetlink_queue and ctnetlink, we can modify the conntrack entry via nfnl_queue. However, the change of the conntrack entry via nfnetlink_queue requires appropriate locking to avoid concurrent updates. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 14f67a2cbcb5..da4fc37a8578 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1896,10 +1896,15 @@ static int ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct) { struct nlattr *cda[CTA_MAX+1]; + int ret; nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy); - return ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct); + spin_lock_bh(&nf_conntrack_lock); + ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct); + spin_unlock_bh(&nf_conntrack_lock); + + return ret; } static struct nfq_ct_hook ctnetlink_nfqueue_hook = { -- cgit From 47be03a28cc6c80e3aa2b3e8ed6d960ff0c5c0af Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:37 +0000 Subject: netpoll: use GFP_ATOMIC in slave_enable_netpoll() and __netpoll_setup() slave_enable_netpoll() and __netpoll_setup() may be called with read_lock() held, so should use GFP_ATOMIC to allocate memory. Eric suggested to pass gfp flags to __netpoll_setup(). Cc: Eric Dumazet Cc: "David S. Miller" Reported-by: Dan Carpenter Signed-off-by: Eric Dumazet Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 7 ++++--- net/bridge/br_device.c | 12 ++++++------ net/bridge/br_if.c | 2 +- net/bridge/br_private.h | 4 ++-- net/core/netpoll.c | 8 ++++---- 5 files changed, 17 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 73a2a83ee2da..ee4ae0944cef 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -669,19 +669,20 @@ static void vlan_dev_poll_controller(struct net_device *dev) return; } -static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo) +static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo, + gfp_t gfp) { struct vlan_dev_priv *info = vlan_dev_priv(dev); struct net_device *real_dev = info->real_dev; struct netpoll *netpoll; int err = 0; - netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); + netpoll = kzalloc(sizeof(*netpoll), gfp); err = -ENOMEM; if (!netpoll) goto out; - err = __netpoll_setup(netpoll, real_dev); + err = __netpoll_setup(netpoll, real_dev, gfp); if (err) { kfree(netpoll); goto out; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 333484537600..ed0e0f9dc788 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -213,7 +213,8 @@ static void br_netpoll_cleanup(struct net_device *dev) } } -static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) +static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni, + gfp_t gfp) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p, *n; @@ -222,8 +223,7 @@ static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) list_for_each_entry_safe(p, n, &br->port_list, list) { if (!p->dev) continue; - - err = br_netpoll_enable(p); + err = br_netpoll_enable(p, gfp); if (err) goto fail; } @@ -236,17 +236,17 @@ fail: goto out; } -int br_netpoll_enable(struct net_bridge_port *p) +int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp) { struct netpoll *np; int err = 0; - np = kzalloc(sizeof(*p->np), GFP_KERNEL); + np = kzalloc(sizeof(*p->np), gfp); err = -ENOMEM; if (!np) goto out; - err = __netpoll_setup(np, p->dev); + err = __netpoll_setup(np, p->dev, gfp); if (err) { kfree(np); goto out; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index e1144e1617be..171fd6b9bfe6 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -361,7 +361,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (err) goto err2; - if (br_netpoll_info(br) && ((err = br_netpoll_enable(p)))) + if (br_netpoll_info(br) && ((err = br_netpoll_enable(p, GFP_KERNEL)))) goto err3; err = netdev_set_master(dev, br->dev); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a768b2408edf..f507d2af9646 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -316,7 +316,7 @@ static inline void br_netpoll_send_skb(const struct net_bridge_port *p, netpoll_send_skb(np, skb); } -extern int br_netpoll_enable(struct net_bridge_port *p); +extern int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp); extern void br_netpoll_disable(struct net_bridge_port *p); #else static inline struct netpoll_info *br_netpoll_info(struct net_bridge *br) @@ -329,7 +329,7 @@ static inline void br_netpoll_send_skb(const struct net_bridge_port *p, { } -static inline int br_netpoll_enable(struct net_bridge_port *p) +static inline int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp) { return 0; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index b4c90e42b443..37cc854774a4 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -715,7 +715,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt) } EXPORT_SYMBOL(netpoll_parse_options); -int __netpoll_setup(struct netpoll *np, struct net_device *ndev) +int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) { struct netpoll_info *npinfo; const struct net_device_ops *ops; @@ -734,7 +734,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) } if (!ndev->npinfo) { - npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); + npinfo = kmalloc(sizeof(*npinfo), gfp); if (!npinfo) { err = -ENOMEM; goto out; @@ -752,7 +752,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) ops = np->dev->netdev_ops; if (ops->ndo_netpoll_setup) { - err = ops->ndo_netpoll_setup(ndev, npinfo); + err = ops->ndo_netpoll_setup(ndev, npinfo, gfp); if (err) goto free_npinfo; } @@ -857,7 +857,7 @@ int netpoll_setup(struct netpoll *np) refill_skbs(); rtnl_lock(); - err = __netpoll_setup(np, ndev); + err = __netpoll_setup(np, ndev, GFP_KERNEL); rtnl_unlock(); if (err) -- cgit From 38e6bc185d9544dfad1774b3f8902a0b061aea25 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:38 +0000 Subject: netpoll: make __netpoll_cleanup non-block Like the previous patch, slave_disable_netpoll() and __netpoll_cleanup() may be called with read_lock() held too, so we should make them non-block, by moving the cleanup and kfree() to call_rcu_bh() callbacks. Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 6 +----- net/bridge/br_device.c | 6 +----- net/core/netpoll.c | 42 ++++++++++++++++++++++++++++++++---------- 3 files changed, 34 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ee4ae0944cef..b65623f90660 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -704,11 +704,7 @@ static void vlan_dev_netpoll_cleanup(struct net_device *dev) info->netpoll = NULL; - /* Wait for transmitting packets to finish before freeing. */ - synchronize_rcu_bh(); - - __netpoll_cleanup(netpoll); - kfree(netpoll); + __netpoll_free_rcu(netpoll); } #endif /* CONFIG_NET_POLL_CONTROLLER */ diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index ed0e0f9dc788..f41ba4048c9a 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -267,11 +267,7 @@ void br_netpoll_disable(struct net_bridge_port *p) p->np = NULL; - /* Wait for transmitting packets to finish before freeing. */ - synchronize_rcu_bh(); - - __netpoll_cleanup(np); - kfree(np); + __netpoll_free_rcu(np); } #endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 37cc854774a4..dc17f1db1479 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -878,6 +878,24 @@ static int __init netpoll_init(void) } core_initcall(netpoll_init); +static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) +{ + struct netpoll_info *npinfo = + container_of(rcu_head, struct netpoll_info, rcu); + + skb_queue_purge(&npinfo->arp_tx); + skb_queue_purge(&npinfo->txq); + + /* we can't call cancel_delayed_work_sync here, as we are in softirq */ + cancel_delayed_work(&npinfo->tx_work); + + /* clean after last, unfinished work */ + __skb_queue_purge(&npinfo->txq); + /* now cancel it again */ + cancel_delayed_work(&npinfo->tx_work); + kfree(npinfo); +} + void __netpoll_cleanup(struct netpoll *np) { struct netpoll_info *npinfo; @@ -903,20 +921,24 @@ void __netpoll_cleanup(struct netpoll *np) ops->ndo_netpoll_cleanup(np->dev); RCU_INIT_POINTER(np->dev->npinfo, NULL); + call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info); + } +} +EXPORT_SYMBOL_GPL(__netpoll_cleanup); - /* avoid racing with NAPI reading npinfo */ - synchronize_rcu_bh(); +static void rcu_cleanup_netpoll(struct rcu_head *rcu_head) +{ + struct netpoll *np = container_of(rcu_head, struct netpoll, rcu); - skb_queue_purge(&npinfo->arp_tx); - skb_queue_purge(&npinfo->txq); - cancel_delayed_work_sync(&npinfo->tx_work); + __netpoll_cleanup(np); + kfree(np); +} - /* clean after last, unfinished work */ - __skb_queue_purge(&npinfo->txq); - kfree(npinfo); - } +void __netpoll_free_rcu(struct netpoll *np) +{ + call_rcu_bh(&np->rcu, rcu_cleanup_netpoll); } -EXPORT_SYMBOL_GPL(__netpoll_cleanup); +EXPORT_SYMBOL_GPL(__netpoll_free_rcu); void netpoll_cleanup(struct netpoll *np) { -- cgit From 57c5d46191e75312934c00eba65b13a31ca95120 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:40 +0000 Subject: netpoll: take rcu_read_lock_bh() in netpoll_rx() In __netpoll_rx(), it dereferences ->npinfo without rcu_dereference_bh(), this patch fixes it by using the 'npinfo' passed from netpoll_rx() where it is already dereferenced with rcu_dereference_bh(). Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/netpoll.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index dc17f1db1479..d055bb01328b 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -543,13 +543,12 @@ static void arp_reply(struct sk_buff *skb) spin_unlock_irqrestore(&npinfo->rx_lock, flags); } -int __netpoll_rx(struct sk_buff *skb) +int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) { int proto, len, ulen; int hits = 0; const struct iphdr *iph; struct udphdr *uh; - struct netpoll_info *npinfo = skb->dev->npinfo; struct netpoll *np, *tmp; if (list_empty(&npinfo->rx_np)) -- cgit From 2899656b494dcd118123af1126826b115c8ea6f9 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:42 +0000 Subject: netpoll: take rcu_read_lock_bh() in netpoll_send_skb_on_dev() This patch fixes several problems in the call path of netpoll_send_skb_on_dev(): 1. Disable IRQ's before calling netpoll_send_skb_on_dev(). 2. All the callees of netpoll_send_skb_on_dev() should use rcu_dereference_bh() to dereference ->npinfo. 3. Rename arp_reply() to netpoll_arp_reply(), the former is too generic. Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/netpoll.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index d055bb01328b..174346ac15a0 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -54,7 +54,7 @@ static atomic_t trapped; MAX_UDP_CHUNK) static void zap_completion_queue(void); -static void arp_reply(struct sk_buff *skb); +static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo); static unsigned int carrier_timeout = 4; module_param(carrier_timeout, uint, 0644); @@ -170,7 +170,8 @@ static void poll_napi(struct net_device *dev) list_for_each_entry(napi, &dev->napi_list, dev_list) { if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { - budget = poll_one_napi(dev->npinfo, napi, budget); + budget = poll_one_napi(rcu_dereference_bh(dev->npinfo), + napi, budget); spin_unlock(&napi->poll_lock); if (!budget) @@ -185,13 +186,14 @@ static void service_arp_queue(struct netpoll_info *npi) struct sk_buff *skb; while ((skb = skb_dequeue(&npi->arp_tx))) - arp_reply(skb); + netpoll_arp_reply(skb, npi); } } static void netpoll_poll_dev(struct net_device *dev) { const struct net_device_ops *ops; + struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); if (!dev || !netif_running(dev)) return; @@ -206,17 +208,18 @@ static void netpoll_poll_dev(struct net_device *dev) poll_napi(dev); if (dev->flags & IFF_SLAVE) { - if (dev->npinfo) { + if (ni) { struct net_device *bond_dev = dev->master; struct sk_buff *skb; - while ((skb = skb_dequeue(&dev->npinfo->arp_tx))) { + struct netpoll_info *bond_ni = rcu_dereference_bh(bond_dev->npinfo); + while ((skb = skb_dequeue(&ni->arp_tx))) { skb->dev = bond_dev; - skb_queue_tail(&bond_dev->npinfo->arp_tx, skb); + skb_queue_tail(&bond_ni->arp_tx, skb); } } } - service_arp_queue(dev->npinfo); + service_arp_queue(ni); zap_completion_queue(); } @@ -302,6 +305,7 @@ static int netpoll_owner_active(struct net_device *dev) return 0; } +/* call with IRQ disabled */ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, struct net_device *dev) { @@ -309,8 +313,11 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, unsigned long tries; const struct net_device_ops *ops = dev->netdev_ops; /* It is up to the caller to keep npinfo alive. */ - struct netpoll_info *npinfo = np->dev->npinfo; + struct netpoll_info *npinfo; + + WARN_ON_ONCE(!irqs_disabled()); + npinfo = rcu_dereference_bh(np->dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { __kfree_skb(skb); return; @@ -319,11 +326,9 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, /* don't get messages out of order, and no recursion */ if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { struct netdev_queue *txq; - unsigned long flags; txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); - local_irq_save(flags); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { @@ -347,10 +352,9 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, } WARN_ONCE(!irqs_disabled(), - "netpoll_send_skb(): %s enabled interrupts in poll (%pF)\n", + "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n", dev->name, ops->ndo_start_xmit); - local_irq_restore(flags); } if (status != NETDEV_TX_OK) { @@ -423,9 +427,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) } EXPORT_SYMBOL(netpoll_send_udp); -static void arp_reply(struct sk_buff *skb) +static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo) { - struct netpoll_info *npinfo = skb->dev->npinfo; struct arphdr *arp; unsigned char *arp_ptr; int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; -- cgit From d30362c0712eb567334b3b66de7c40d4372f2c6f Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:43 +0000 Subject: bridge: add some comments for NETDEV_RELEASE Add comments on why we don't notify NETDEV_RELEASE. Cc: David Miller Cc: Stephen Hemminger Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/bridge/br_if.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 171fd6b9bfe6..1c8fdc3558cd 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -427,6 +427,10 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) if (!p || p->br != br) return -EINVAL; + /* Since more than one interface can be attached to a bridge, + * there still maybe an alternate path for netconsole to use; + * therefore there is no reason for a NETDEV_RELEASE event. + */ del_nbp(p); spin_lock_bh(&br->lock); -- cgit From 4e3828c4bfd90b00a951cad7c8da27d1966beefe Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:44 +0000 Subject: bridge: use list_for_each_entry() in netpoll functions We don't delete 'p' from the list in the loop, so we can just use list_for_each_entry(). Cc: David Miller Cc: Stephen Hemminger Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/bridge/br_device.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f41ba4048c9a..32211fa5b506 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -206,21 +206,20 @@ static void br_poll_controller(struct net_device *br_dev) static void br_netpoll_cleanup(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); - struct net_bridge_port *p, *n; + struct net_bridge_port *p; - list_for_each_entry_safe(p, n, &br->port_list, list) { + list_for_each_entry(p, &br->port_list, list) br_netpoll_disable(p); - } } static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni, gfp_t gfp) { struct net_bridge *br = netdev_priv(dev); - struct net_bridge_port *p, *n; + struct net_bridge_port *p; int err = 0; - list_for_each_entry_safe(p, n, &br->port_list, list) { + list_for_each_entry(p, &br->port_list, list) { if (!p->dev) continue; err = br_netpoll_enable(p, gfp); -- cgit From e15c3c2294605f09f9b336b2f3b97086ab4b8145 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:45 +0000 Subject: netpoll: check netpoll tx status on the right device Although this doesn't matter actually, because netpoll_tx_running() doesn't use the parameter, the code will be more readable. For team_dev_queue_xmit() we have to move it down to avoid compile errors. Cc: David Miller Signed-off-by: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/bridge/br_forward.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index e9466d412707..02015a505d2a 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -65,7 +65,7 @@ static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) { skb->dev = to->dev; - if (unlikely(netpoll_tx_running(to->dev))) { + if (unlikely(netpoll_tx_running(to->br->dev))) { if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb)) kfree_skb(skb); else { -- cgit From f3da38932b46d1bf171634cc7aae3da405eaf7a6 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:47 +0000 Subject: vlan: clean up some variable names To be consistent, s/info/vlan/. Cc: Benjamin LaHaise Cc: Patrick McHardy Cc: David Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index b65623f90660..0347d48aa7c9 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -672,8 +672,8 @@ static void vlan_dev_poll_controller(struct net_device *dev) static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo, gfp_t gfp) { - struct vlan_dev_priv *info = vlan_dev_priv(dev); - struct net_device *real_dev = info->real_dev; + struct vlan_dev_priv *vlan = vlan_dev_priv(dev); + struct net_device *real_dev = vlan->real_dev; struct netpoll *netpoll; int err = 0; @@ -688,7 +688,7 @@ static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *n goto out; } - info->netpoll = netpoll; + vlan->netpoll = netpoll; out: return err; @@ -696,13 +696,13 @@ out: static void vlan_dev_netpoll_cleanup(struct net_device *dev) { - struct vlan_dev_priv *info = vlan_dev_priv(dev); - struct netpoll *netpoll = info->netpoll; + struct vlan_dev_priv *vlan= vlan_dev_priv(dev); + struct netpoll *netpoll = vlan->netpoll; if (!netpoll) return; - info->netpoll = NULL; + vlan->netpoll = NULL; __netpoll_free_rcu(netpoll); } -- cgit From 6eacf8ad8d01c49b95b994b0bf379db2b5b29460 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:48 +0000 Subject: vlan: clean up vlan_dev_hard_start_xmit() Clean up vlan_dev_hard_start_xmit() function. Cc: Benjamin LaHaise Cc: Patrick McHardy Cc: David Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 0347d48aa7c9..402442402af7 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -137,9 +137,21 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, return rc; } +static inline netdev_tx_t vlan_netpoll_send_skb(struct vlan_dev_priv *vlan, struct sk_buff *skb) +{ +#ifdef CONFIG_NET_POLL_CONTROLLER + if (vlan->netpoll) + netpoll_send_skb(vlan->netpoll, skb); +#else + BUG(); +#endif + return NETDEV_TX_OK; +} + static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { + struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); unsigned int len; int ret; @@ -150,29 +162,30 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs... */ if (veth->h_vlan_proto != htons(ETH_P_8021Q) || - vlan_dev_priv(dev)->flags & VLAN_FLAG_REORDER_HDR) { + vlan->flags & VLAN_FLAG_REORDER_HDR) { u16 vlan_tci; - vlan_tci = vlan_dev_priv(dev)->vlan_id; + vlan_tci = vlan->vlan_id; vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb); skb = __vlan_hwaccel_put_tag(skb, vlan_tci); } - skb->dev = vlan_dev_priv(dev)->real_dev; + skb->dev = vlan->real_dev; len = skb->len; - if (netpoll_tx_running(dev)) - return skb->dev->netdev_ops->ndo_start_xmit(skb, skb->dev); + if (unlikely(netpoll_tx_running(dev))) + return vlan_netpoll_send_skb(vlan, skb); + ret = dev_queue_xmit(skb); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct vlan_pcpu_stats *stats; - stats = this_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats); + stats = this_cpu_ptr(vlan->vlan_pcpu_stats); u64_stats_update_begin(&stats->syncp); stats->tx_packets++; stats->tx_bytes += len; u64_stats_update_end(&stats->syncp); } else { - this_cpu_inc(vlan_dev_priv(dev)->vlan_pcpu_stats->tx_dropped); + this_cpu_inc(vlan->vlan_pcpu_stats->tx_dropped); } return ret; -- cgit From 689971b44613883ee74ae9c1b31a864aaa3a8e17 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:49 +0000 Subject: netpoll: handle vlan tags in netpoll tx and rx path Without this patch, I can't get netconsole logs remotely over vlan. The reason is probably we don't handle vlan tags in either netpoll tx or rx path. I am not sure if I use these vlan functions correctly, at least this patch works. Cc: Benjamin LaHaise Cc: Patrick McHardy Cc: David Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/netpoll.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 174346ac15a0..e4ba3e70c174 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -334,6 +335,14 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, tries > 0; --tries) { if (__netif_tx_trylock(txq)) { if (!netif_xmit_stopped(txq)) { + if (vlan_tx_tag_present(skb) && + !(netif_skb_features(skb) & NETIF_F_HW_VLAN_TX)) { + skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); + if (unlikely(!skb)) + break; + skb->vlan_tci = 0; + } + status = ops->ndo_start_xmit(skb, dev); if (status == NETDEV_TX_OK) txq_trans_update(txq); @@ -567,6 +576,12 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) return 1; } + if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { + skb = vlan_untag(skb); + if (unlikely(!skb)) + goto out; + } + proto = ntohs(eth_hdr(skb)->h_proto); if (proto != ETH_P_IP) goto out; -- cgit From 6bdb7fe31046ac50b47e83c35cd6c6b6160a475d Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:50 +0000 Subject: netpoll: re-enable irq in poll_napi() napi->poll() needs IRQ enabled, so we have to re-enable IRQ before calling it. Cc: David Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/netpoll.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index e4ba3e70c174..346b1eb83a1f 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -168,16 +168,24 @@ static void poll_napi(struct net_device *dev) struct napi_struct *napi; int budget = 16; + WARN_ON_ONCE(!irqs_disabled()); + list_for_each_entry(napi, &dev->napi_list, dev_list) { + local_irq_enable(); if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { + rcu_read_lock_bh(); budget = poll_one_napi(rcu_dereference_bh(dev->npinfo), napi, budget); + rcu_read_unlock_bh(); spin_unlock(&napi->poll_lock); - if (!budget) + if (!budget) { + local_irq_disable(); break; + } } + local_irq_disable(); } } -- cgit From 7bd86cc282a458b66c41e3f6676de6656c99b8db Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 12 Aug 2012 20:09:59 +0000 Subject: ipv4: Cache local output routes Commit caacf05e5ad1abf causes big drop of UDP loop back performance. The cause of the regression is that we do not cache the local output routes. Each time we send a datagram from unconnected UDP socket, the kernel allocates a dst_entry and adds it to the rt_uncached_list. It creates lock contention on the rt_uncached_lock. Reported-by: Alex Shi Signed-off-by: Yan, Zheng Signed-off-by: David S. Miller --- net/ipv4/route.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e4ba974f143c..fd9ecb52c66b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2028,7 +2028,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) } dev_out = net->loopback_dev; fl4->flowi4_oif = dev_out->ifindex; - res.fi = NULL; flags |= RTCF_LOCAL; goto make_route; } -- cgit From 4855d6f3116e891b66198838b683dce3dcf6e874 Mon Sep 17 00:00:00 2001 From: Igor Maravic Date: Sun, 12 Aug 2012 22:31:58 +0000 Subject: net: ipv6: proc: Fix error handling Fix error handling in case making of dir dev_snmp6 failes Signed-off-by: Igor Maravic Signed-off-by: David S. Miller --- net/ipv6/proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index da2e92d05c15..745a32042950 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -307,10 +307,10 @@ static int __net_init ipv6_proc_init_net(struct net *net) goto proc_dev_snmp6_fail; return 0; +proc_dev_snmp6_fail: + proc_net_remove(net, "snmp6"); proc_snmp6_fail: proc_net_remove(net, "sockstat6"); -proc_dev_snmp6_fail: - proc_net_remove(net, "dev_snmp6"); return -ENOMEM; } -- cgit From 6024935f5ff5f1646bce8404416318e5fd4a0c4a Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 13 Aug 2012 02:49:59 +0000 Subject: llc2: Fix silent failure of llc_station_init() llc_station_init() creates and processes an event skb with no effect other than to change the state from DOWN to UP. Allocation failure is reported, but then ignored by its caller, llc2_init(). Remove this possibility by simply initialising the state as UP. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/llc/llc_station.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c index 6828e39ec2ec..45ddbb93c5d0 100644 --- a/net/llc/llc_station.c +++ b/net/llc/llc_station.c @@ -687,12 +687,8 @@ static void llc_station_rcv(struct sk_buff *skb) llc_station_state_process(skb); } -int __init llc_station_init(void) +void __init llc_station_init(void) { - int rc = -ENOBUFS; - struct sk_buff *skb; - struct llc_station_state_ev *ev; - skb_queue_head_init(&llc_main_station.mac_pdu_q); skb_queue_head_init(&llc_main_station.ev_q.list); spin_lock_init(&llc_main_station.ev_q.lock); @@ -700,20 +696,9 @@ int __init llc_station_init(void) (unsigned long)&llc_main_station); llc_main_station.ack_timer.expires = jiffies + sysctl_llc_station_ack_timeout; - skb = alloc_skb(0, GFP_ATOMIC); - if (!skb) - goto out; - rc = 0; llc_set_station_handler(llc_station_rcv); - ev = llc_station_ev(skb); - memset(ev, 0, sizeof(*ev)); llc_main_station.maximum_retry = 1; - llc_main_station.state = LLC_STATION_STATE_DOWN; - ev->type = LLC_STATION_EV_TYPE_SIMPLE; - ev->prim_type = LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK; - rc = llc_station_next_state(skb); -out: - return rc; + llc_main_station.state = LLC_STATION_STATE_UP; } void __exit llc_station_exit(void) -- cgit From f4f8720febf0d785a054fc09bde5e3ad09728a58 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 13 Aug 2012 02:50:43 +0000 Subject: llc2: Call llc_station_exit() on llc2_init() failure path Otherwise the station packet handler will remain registered even though the module is unloaded. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/llc/af_llc.c | 5 +++-- net/llc/llc_station.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index f6fe4d400502..8c2919ca36f6 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -1206,7 +1206,7 @@ static int __init llc2_init(void) rc = llc_proc_init(); if (rc != 0) { printk(llc_proc_err_msg); - goto out_unregister_llc_proto; + goto out_station; } rc = llc_sysctl_init(); if (rc) { @@ -1226,7 +1226,8 @@ out_sysctl: llc_sysctl_exit(); out_proc: llc_proc_exit(); -out_unregister_llc_proto: +out_station: + llc_station_exit(); proto_unregister(&llc_proto); goto out; } diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c index 45ddbb93c5d0..bba5184fafd7 100644 --- a/net/llc/llc_station.c +++ b/net/llc/llc_station.c @@ -701,7 +701,7 @@ void __init llc_station_init(void) llc_main_station.state = LLC_STATION_STATE_UP; } -void __exit llc_station_exit(void) +void llc_station_exit(void) { llc_set_station_handler(NULL); } -- cgit From aadf31de16a7b2878af00a02e6557df84efa784b Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 13 Aug 2012 02:50:55 +0000 Subject: llc: Fix races between llc2 handler use and (un)registration When registering the handlers, any state they rely on must be completely initialised first. When unregistering, we must wait until they are definitely no longer running. llc_rcv() must also avoid reading the handler pointers again after checking for NULL. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/llc/llc_input.c | 21 +++++++++++++++++---- net/llc/llc_station.c | 2 +- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index e32cab44ea95..dd3e83328ad5 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -42,6 +42,7 @@ static void (*llc_type_handlers[2])(struct llc_sap *sap, void llc_add_pack(int type, void (*handler)(struct llc_sap *sap, struct sk_buff *skb)) { + smp_wmb(); /* ensure initialisation is complete before it's called */ if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) llc_type_handlers[type - 1] = handler; } @@ -50,11 +51,19 @@ void llc_remove_pack(int type) { if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) llc_type_handlers[type - 1] = NULL; + synchronize_net(); } void llc_set_station_handler(void (*handler)(struct sk_buff *skb)) { + /* Ensure initialisation is complete before it's called */ + if (handler) + smp_wmb(); + llc_station_handler = handler; + + if (!handler) + synchronize_net(); } /** @@ -150,6 +159,8 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, int dest; int (*rcv)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); + void (*sta_handler)(struct sk_buff *skb); + void (*sap_handler)(struct llc_sap *sap, struct sk_buff *skb); if (!net_eq(dev_net(dev), &init_net)) goto drop; @@ -182,7 +193,8 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, */ rcv = rcu_dereference(sap->rcv_func); dest = llc_pdu_type(skb); - if (unlikely(!dest || !llc_type_handlers[dest - 1])) { + sap_handler = dest ? ACCESS_ONCE(llc_type_handlers[dest - 1]) : NULL; + if (unlikely(!sap_handler)) { if (rcv) rcv(skb, dev, pt, orig_dev); else @@ -193,7 +205,7 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, if (cskb) rcv(cskb, dev, pt, orig_dev); } - llc_type_handlers[dest - 1](sap, skb); + sap_handler(sap, skb); } llc_sap_put(sap); out: @@ -202,9 +214,10 @@ drop: kfree_skb(skb); goto out; handle_station: - if (!llc_station_handler) + sta_handler = ACCESS_ONCE(llc_station_handler); + if (!sta_handler) goto drop; - llc_station_handler(skb); + sta_handler(skb); goto out; } diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c index bba5184fafd7..b2f2bac2c2a2 100644 --- a/net/llc/llc_station.c +++ b/net/llc/llc_station.c @@ -696,9 +696,9 @@ void __init llc_station_init(void) (unsigned long)&llc_main_station); llc_main_station.ack_timer.expires = jiffies + sysctl_llc_station_ack_timeout; - llc_set_station_handler(llc_station_rcv); llc_main_station.maximum_retry = 1; llc_main_station.state = LLC_STATION_STATE_UP; + llc_set_station_handler(llc_station_rcv); } void llc_station_exit(void) -- cgit From 4acd4945cd1e1f92b20d14e349c6c6a52acbd42d Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 14 Aug 2012 08:54:51 +0000 Subject: ipv6: addrconf: Avoid calling netdevice notifiers with RCU read-side lock Cong Wang reports that lockdep detected suspicious RCU usage while enabling IPV6 forwarding: [ 1123.310275] =============================== [ 1123.442202] [ INFO: suspicious RCU usage. ] [ 1123.558207] 3.6.0-rc1+ #109 Not tainted [ 1123.665204] ------------------------------- [ 1123.768254] include/linux/rcupdate.h:430 Illegal context switch in RCU read-side critical section! [ 1123.992320] [ 1123.992320] other info that might help us debug this: [ 1123.992320] [ 1124.307382] [ 1124.307382] rcu_scheduler_active = 1, debug_locks = 0 [ 1124.522220] 2 locks held by sysctl/5710: [ 1124.648364] #0: (rtnl_mutex){+.+.+.}, at: [] rtnl_trylock+0x15/0x17 [ 1124.882211] #1: (rcu_read_lock){.+.+.+}, at: [] rcu_lock_acquire+0x0/0x29 [ 1125.085209] [ 1125.085209] stack backtrace: [ 1125.332213] Pid: 5710, comm: sysctl Not tainted 3.6.0-rc1+ #109 [ 1125.441291] Call Trace: [ 1125.545281] [] lockdep_rcu_suspicious+0x109/0x112 [ 1125.667212] [] rcu_preempt_sleep_check+0x45/0x47 [ 1125.781838] [] __might_sleep+0x1e/0x19b [...] [ 1127.445223] [] call_netdevice_notifiers+0x4a/0x4f [...] [ 1127.772188] [] dev_disable_lro+0x32/0x6b [ 1127.885174] [] dev_forward_change+0x30/0xcb [ 1128.013214] [] addrconf_forward_change+0x85/0xc5 [...] addrconf_forward_change() uses RCU iteration over the netdev list, which is unnecessary since it already holds the RTNL lock. We also cannot reasonably require netdevice notifier functions not to sleep. Reported-by: Cong Wang Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 79181819a24f..6bc85f7c31e3 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -494,8 +494,7 @@ static void addrconf_forward_change(struct net *net, __s32 newf) struct net_device *dev; struct inet6_dev *idev; - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { idev = __in6_dev_get(dev); if (idev) { int changed = (!idev->cnf.forwarding) ^ (!newf); @@ -504,7 +503,6 @@ static void addrconf_forward_change(struct net *net, __s32 newf) dev_forward_change(idev); } } - rcu_read_unlock(); } static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) -- cgit From c03307eab68d583ea6db917681afa14ed1fb3b84 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 14 Aug 2012 08:19:33 -0700 Subject: bridge: fix rcu dereference outside of rcu_read_lock Alternative solution for problem found by Linux Driver Verification project (linuxtesting.org). As it noted in the comment before the br_handle_frame_finish function, this function should be called under rcu_read_lock. The problem callgraph: br_dev_xmit -> br_nf_pre_routing_finish_bridge_slow -> -> br_handle_frame_finish -> br_port_get_rcu -> rcu_dereference And in this case there is no read-lock section. Reported-by: Denis Efremov Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 32211fa5b506..070e8a68cfc6 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -31,9 +31,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct net_bridge_mdb_entry *mdst; struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats); + rcu_read_lock(); #ifdef CONFIG_BRIDGE_NETFILTER if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) { br_nf_pre_routing_finish_bridge_slow(skb); + rcu_read_unlock(); return NETDEV_TX_OK; } #endif @@ -48,7 +50,6 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb); skb_pull(skb, ETH_HLEN); - rcu_read_lock(); if (is_broadcast_ether_addr(dest)) br_flood_deliver(br, skb); else if (is_multicast_ether_addr(dest)) { -- cgit From e862f1a9b7df4e8196ebec45ac62295138aa3fc2 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:44 +0000 Subject: atm: fix info leak in getsockopt(SO_ATMPVC) The ATM code fails to initialize the two padding bytes of struct sockaddr_atmpvc inserted for alignment. Add an explicit memset(0) before filling the structure to avoid the info leak. Signed-off-by: Mathias Krause Signed-off-by: David S. Miller --- net/atm/common.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/atm/common.c b/net/atm/common.c index b4b44dbed645..0c0ad930a632 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -812,6 +812,7 @@ int vcc_getsockopt(struct socket *sock, int level, int optname, if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags)) return -ENOTCONN; + memset(&pvc, 0, sizeof(pvc)); pvc.sap_family = AF_ATMPVC; pvc.sap_addr.itf = vcc->dev->number; pvc.sap_addr.vpi = vcc->vpi; -- cgit From 3c0c5cfdcd4d69ffc4b9c0907cec99039f30a50a Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:45 +0000 Subject: atm: fix info leak via getsockname() The ATM code fails to initialize the two padding bytes of struct sockaddr_atmpvc inserted for alignment. Add an explicit memset(0) before filling the structure to avoid the info leak. Signed-off-by: Mathias Krause Signed-off-by: David S. Miller --- net/atm/pvc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 3a734919c36c..ae0324021407 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -95,6 +95,7 @@ static int pvc_getname(struct socket *sock, struct sockaddr *sockaddr, return -ENOTCONN; *sockaddr_len = sizeof(struct sockaddr_atmpvc); addr = (struct sockaddr_atmpvc *)sockaddr; + memset(addr, 0, sizeof(*addr)); addr->sap_family = AF_ATMPVC; addr->sap_addr.itf = vcc->dev->number; addr->sap_addr.vpi = vcc->vpi; -- cgit From e15ca9a0ef9a86f0477530b0f44a725d67f889ee Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:46 +0000 Subject: Bluetooth: HCI - Fix info leak in getsockopt(HCI_FILTER) The HCI code fails to initialize the two padding bytes of struct hci_ufilter before copying it to userland -- that for leaking two bytes kernel stack. Add an explicit memset(0) before filling the structure to avoid the info leak. Signed-off-by: Mathias Krause Cc: Marcel Holtmann Cc: Gustavo Padovan Cc: Johan Hedberg Signed-off-by: David S. Miller --- net/bluetooth/hci_sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index a7f04de03d79..a27bbc3cd4b7 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1009,6 +1009,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, { struct hci_filter *f = &hci_pi(sk)->filter; + memset(&uf, 0, sizeof(uf)); uf.type_mask = f->type_mask; uf.opcode = f->opcode; uf.event_mask[0] = *((u32 *) f->event_mask + 0); -- cgit From 3f68ba07b1da811bf383b4b701b129bfcb2e4988 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:47 +0000 Subject: Bluetooth: HCI - Fix info leak via getsockname() The HCI code fails to initialize the hci_channel member of struct sockaddr_hci and that for leaks two bytes kernel stack via the getsockname() syscall. Initialize hci_channel with 0 to avoid the info leak. Signed-off-by: Mathias Krause Cc: Marcel Holtmann Cc: Gustavo Padovan Cc: Johan Hedberg Signed-off-by: David S. Miller --- net/bluetooth/hci_sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index a27bbc3cd4b7..19fdac78e555 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -694,6 +694,7 @@ static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, *addr_len = sizeof(*haddr); haddr->hci_family = AF_BLUETOOTH; haddr->hci_dev = hdev->id; + haddr->hci_channel= 0; release_sock(sk); return 0; -- cgit From 9ad2de43f1aee7e7274a4e0d41465489299e344b Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:48 +0000 Subject: Bluetooth: RFCOMM - Fix info leak in getsockopt(BT_SECURITY) The RFCOMM code fails to initialize the key_size member of struct bt_security before copying it to userland -- that for leaking one byte kernel stack. Initialize key_size with 0 to avoid the info leak. Signed-off-by: Mathias Krause Cc: Marcel Holtmann Cc: Gustavo Padovan Cc: Johan Hedberg Signed-off-by: David S. Miller --- net/bluetooth/rfcomm/sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 7e1e59645c05..64f55ca61472 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -822,6 +822,7 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c } sec.level = rfcomm_pi(sk)->sec_level; + sec.key_size = 0; len = min_t(unsigned int, len, sizeof(sec)); if (copy_to_user(optval, (char *) &sec, len)) -- cgit From f9432c5ec8b1e9a09b9b0e5569e3c73db8de432a Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:49 +0000 Subject: Bluetooth: RFCOMM - Fix info leak in ioctl(RFCOMMGETDEVLIST) The RFCOMM code fails to initialize the two padding bytes of struct rfcomm_dev_list_req inserted for alignment before copying it to userland. Additionally there are two padding bytes in each instance of struct rfcomm_dev_info. The ioctl() that for disclosures two bytes plus dev_num times two bytes uninitialized kernel heap memory. Allocate the memory using kzalloc() to fix this issue. Signed-off-by: Mathias Krause Cc: Marcel Holtmann Cc: Gustavo Padovan Cc: Johan Hedberg Signed-off-by: David S. Miller --- net/bluetooth/rfcomm/tty.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index cb960773c002..56f182393c4c 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -456,7 +456,7 @@ static int rfcomm_get_dev_list(void __user *arg) size = sizeof(*dl) + dev_num * sizeof(*di); - dl = kmalloc(size, GFP_KERNEL); + dl = kzalloc(size, GFP_KERNEL); if (!dl) return -ENOMEM; -- cgit From 9344a972961d1a6d2c04d9008b13617bcb6ec2ef Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:50 +0000 Subject: Bluetooth: RFCOMM - Fix info leak via getsockname() The RFCOMM code fails to initialize the trailing padding byte of struct sockaddr_rc added for alignment. It that for leaks one byte kernel stack via the getsockname() syscall. Add an explicit memset(0) before filling the structure to avoid the info leak. Signed-off-by: Mathias Krause Cc: Marcel Holtmann Cc: Gustavo Padovan Cc: Johan Hedberg Signed-off-by: David S. Miller --- net/bluetooth/rfcomm/sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 64f55ca61472..1a17850d093c 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -528,6 +528,7 @@ static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int * BT_DBG("sock %p, sk %p", sock, sk); + memset(sa, 0, sizeof(*sa)); sa->rc_family = AF_BLUETOOTH; sa->rc_channel = rfcomm_pi(sk)->channel; if (peer) -- cgit From 792039c73cf176c8e39a6e8beef2c94ff46522ed Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:51 +0000 Subject: Bluetooth: L2CAP - Fix info leak via getsockname() The L2CAP code fails to initialize the l2_bdaddr_type member of struct sockaddr_l2 and the padding byte added for alignment. It that for leaks two bytes kernel stack via the getsockname() syscall. Add an explicit memset(0) before filling the structure to avoid the info leak. Signed-off-by: Mathias Krause Cc: Marcel Holtmann Cc: Gustavo Padovan Cc: Johan Hedberg Signed-off-by: David S. Miller --- net/bluetooth/l2cap_sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index b94abd30e6f9..1497edd191a2 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -245,6 +245,7 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, int *l BT_DBG("sock %p, sk %p", sock, sk); + memset(la, 0, sizeof(struct sockaddr_l2)); addr->sa_family = AF_BLUETOOTH; *len = sizeof(struct sockaddr_l2); -- cgit From 04d4fbca1017c11381e7d82acea21dd741e748bc Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:52 +0000 Subject: l2tp: fix info leak via getsockname() The L2TP code for IPv6 fails to initialize the l2tp_unused member of struct sockaddr_l2tpip6 and that for leaks two bytes kernel stack via the getsockname() syscall. Initialize l2tp_unused with 0 to avoid the info leak. Signed-off-by: Mathias Krause Cc: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_ip6.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 35e1e4bde587..927547171bc7 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -410,6 +410,7 @@ static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr, lsa->l2tp_family = AF_INET6; lsa->l2tp_flowinfo = 0; lsa->l2tp_scope_id = 0; + lsa->l2tp_unused = 0; if (peer) { if (!lsk->peer_conn_id) return -ENOTCONN; -- cgit From 3592aaeb80290bda0f2cf0b5456c97bfc638b192 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:53 +0000 Subject: llc: fix info leak via getsockname() The LLC code wrongly returns 0, i.e. "success", when the socket is zapped. Together with the uninitialized uaddrlen pointer argument from sys_getsockname this leads to an arbitrary memory leak of up to 128 bytes kernel stack via the getsockname() syscall. Return an error instead when the socket is zapped to prevent the info leak. Also remove the unnecessary memset(0). We don't directly write to the memory pointed by uaddr but memcpy() a local structure at the end of the function that is properly initialized. Signed-off-by: Mathias Krause Cc: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/llc/af_llc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 8c2919ca36f6..c2190005a114 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -969,14 +969,13 @@ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, struct sockaddr_llc sllc; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); - int rc = 0; + int rc = -EBADF; memset(&sllc, 0, sizeof(sllc)); lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) goto out; *uaddrlen = sizeof(sllc); - memset(uaddr, 0, *uaddrlen); if (peer) { rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) -- cgit From 276bdb82dedb290511467a5a4fdbe9f0b52dce6f Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:54 +0000 Subject: dccp: check ccid before dereferencing ccid_hc_rx_getsockopt() and ccid_hc_tx_getsockopt() might be called with a NULL ccid pointer leading to a NULL pointer dereference. This could lead to a privilege escalation if the attacker is able to map page 0 and prepare it with a fake ccid_ops pointer. Signed-off-by: Mathias Krause Cc: Gerrit Renker Cc: stable@vger.kernel.org Signed-off-by: David S. Miller --- net/dccp/ccid.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 75c3582a7678..fb85d371a8de 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -246,7 +246,7 @@ static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk, u32 __user *optval, int __user *optlen) { int rc = -ENOPROTOOPT; - if (ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) + if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len, optval, optlen); return rc; @@ -257,7 +257,7 @@ static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk, u32 __user *optval, int __user *optlen) { int rc = -ENOPROTOOPT; - if (ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) + if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len, optval, optlen); return rc; -- cgit From 7b07f8eb75aa3097cdfd4f6eac3da49db787381d Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:55 +0000 Subject: dccp: fix info leak via getsockopt(DCCP_SOCKOPT_CCID_TX_INFO) The CCID3 code fails to initialize the trailing padding bytes of struct tfrc_tx_info added for alignment on 64 bit architectures. It that for potentially leaks four bytes kernel stack via the getsockopt() syscall. Add an explicit memset(0) before filling the structure to avoid the info leak. Signed-off-by: Mathias Krause Cc: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid3.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index d65e98798eca..119c04317d48 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -535,6 +535,7 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, case DCCP_SOCKOPT_CCID_TX_INFO: if (len < sizeof(tfrc)) return -EINVAL; + memset(&tfrc, 0, sizeof(tfrc)); tfrc.tfrctx_x = hc->tx_x; tfrc.tfrctx_x_recv = hc->tx_x_recv; tfrc.tfrctx_x_calc = hc->tx_x_calc; -- cgit From 2d8a041b7bfe1097af21441cb77d6af95f4f4680 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:56 +0000 Subject: ipvs: fix info leak in getsockopt(IP_VS_SO_GET_TIMEOUT) If at least one of CONFIG_IP_VS_PROTO_TCP or CONFIG_IP_VS_PROTO_UDP is not set, __ip_vs_get_timeouts() does not fully initialize the structure that gets copied to userland and that for leaks up to 12 bytes of kernel stack. Add an explicit memset(0) before passing the structure to __ip_vs_get_timeouts() to avoid the info leak. Signed-off-by: Mathias Krause Cc: Wensong Zhang Cc: Simon Horman Cc: Julian Anastasov Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_ctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 84444dda194b..72bf32a84874 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2759,6 +2759,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { struct ip_vs_timeout_user t; + memset(&t, 0, sizeof(t)); __ip_vs_get_timeouts(net, &t); if (copy_to_user(user, &t, sizeof(t)) != 0) ret = -EFAULT; -- cgit From 43da5f2e0d0c69ded3d51907d9552310a6b545e8 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:57 +0000 Subject: net: fix info leak in compat dev_ifconf() The implementation of dev_ifconf() for the compat ioctl interface uses an intermediate ifc structure allocated in userland for the duration of the syscall. Though, it fails to initialize the padding bytes inserted for alignment and that for leaks four bytes of kernel stack. Add an explicit memset(0) before filling the structure to avoid the info leak. Signed-off-by: Mathias Krause Signed-off-by: David S. Miller --- net/socket.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index dfe5b66c97e0..a5471f804d99 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2657,6 +2657,7 @@ static int dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32) if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf))) return -EFAULT; + memset(&ifc, 0, sizeof(ifc)); if (ifc32.ifcbuf == 0) { ifc32.ifc_len = 0; ifc.ifc_len = 0; -- cgit From 2614f86490122bf51eb7c12ec73927f1900f4e7d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 16 Aug 2012 02:25:24 +0200 Subject: netfilter: nf_ct_expect: fix possible access to uninitialized timer In __nf_ct_expect_check, the function refresh_timer returns 1 if a matching expectation is found and its timer is successfully refreshed. This results in nf_ct_expect_related returning 0. Note that at this point: - the passed expectation is not inserted in the expectation table and its timer was not initialized, since we have refreshed one matching/existing expectation. - nf_ct_expect_alloc uses kmem_cache_alloc, so the expectation timer is in some undefined state just after the allocation, until it is appropriately initialized. This can be a problem for the SIP helper during the expectation addition: ... if (nf_ct_expect_related(rtp_exp) == 0) { if (nf_ct_expect_related(rtcp_exp) != 0) nf_ct_unexpect_related(rtp_exp); ... Note that nf_ct_expect_related(rtp_exp) may return 0 for the timer refresh case that is detailed above. Then, if nf_ct_unexpect_related(rtcp_exp) returns != 0, nf_ct_unexpect_related(rtp_exp) is called, which does: spin_lock_bh(&nf_conntrack_lock); if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } spin_unlock_bh(&nf_conntrack_lock); Note that del_timer always returns false if the timer has been initialized. However, the timer was not initialized since setup_timer was not called, therefore, the expectation timer remains in some undefined state. If I'm not missing anything, this may lead to the removal an unexistent expectation. To fix this, the optimization that allows refreshing an expectation is removed. Now nf_conntrack_expect_related looks more consistent to me since it always add the expectation in case that it returns success. Thanks to Patrick McHardy for participating in the discussion of this patch. I think this may be the source of the problem described by: http://marc.info/?l=netfilter-devel&m=134073514719421&w=2 Reported-by: Rafal Fitt Acked-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_expect.c | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 45cf602a76bc..527651a53a45 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -361,23 +361,6 @@ static void evict_oldest_expect(struct nf_conn *master, } } -static inline int refresh_timer(struct nf_conntrack_expect *i) -{ - struct nf_conn_help *master_help = nfct_help(i->master); - const struct nf_conntrack_expect_policy *p; - - if (!del_timer(&i->timeout)) - return 0; - - p = &rcu_dereference_protected( - master_help->helper, - lockdep_is_held(&nf_conntrack_lock) - )->expect_policy[i->class]; - i->timeout.expires = jiffies + p->timeout * HZ; - add_timer(&i->timeout); - return 1; -} - static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) { const struct nf_conntrack_expect_policy *p; @@ -386,7 +369,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(expect); - struct hlist_node *n; + struct hlist_node *n, *next; unsigned int h; int ret = 1; @@ -395,12 +378,12 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) goto out; } h = nf_ct_expect_dst_hash(&expect->tuple); - hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { + hlist_for_each_entry_safe(i, n, next, &net->ct.expect_hash[h], hnode) { if (expect_matches(i, expect)) { - /* Refresh timer: if it's dying, ignore.. */ - if (refresh_timer(i)) { - ret = 0; - goto out; + if (del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + nf_ct_expect_put(i); + break; } } else if (expect_clash(i, expect)) { ret = -EBUSY; -- cgit From 16c0b164bd24d44db137693a36b428ba28970c62 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 15 Aug 2012 20:44:27 +0000 Subject: act_mirred: do not drop packets when fails to mirror it We drop packet unconditionally when we fail to mirror it. This is not intended in some cases. Consdier for kvm guest, we may mirror the traffic of the bridge to a tap device used by a VM. When kernel fails to mirror the packet in conditions such as when qemu crashes or stop polling the tap, it's hard for the management software to detect such condition and clean the the mirroring before. This would lead all packets to the bridge to be dropped and break the netowrk of other virtual machines. To solve the issue, the patch does not drop packets when kernel fails to mirror it, and only drop the redirected packets. Signed-off-by: Jason Wang Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index fe81cc18e9e0..9c0fd0c78814 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -200,13 +200,12 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, out: if (err) { m->tcf_qstats.overlimits++; - /* should we be asking for packet to be dropped? - * may make sense for redirect case only - */ - retval = TC_ACT_SHOT; - } else { + if (m->tcfm_eaction != TCA_EGRESS_MIRROR) + retval = TC_ACT_SHOT; + else + retval = m->tcf_action; + } else retval = m->tcf_action; - } spin_unlock(&m->tcf_lock); return retval; -- cgit From f796c20cf67aa54c9130d0dc41307c0025719b85 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 14 Aug 2012 12:34:24 +0000 Subject: net: netprio: fix files lock and remove useless d_path bits Add lock to prevent a race with a file closing and also remove useless and ugly sscanf code. The extra code was never needed and the case it supposedly protected against is in fact handled correctly by sock_from_file as pointed out by Al Viro. CC: Neil Horman Reported-by: Al Viro Signed-off-by: John Fastabend Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/core/netprio_cgroup.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index ed0c0431fcd8..f65dba3afd99 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -277,12 +277,6 @@ out_free_devname: void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *p; - char *tmp = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); - - if (!tmp) { - pr_warn("Unable to attach cgrp due to alloc failure!\n"); - return; - } cgroup_taskset_for_each(p, cgrp, tset) { unsigned int fd; @@ -296,32 +290,24 @@ void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) continue; } - rcu_read_lock(); + spin_lock(&files->file_lock); fdt = files_fdtable(files); for (fd = 0; fd < fdt->max_fds; fd++) { - char *path; struct file *file; struct socket *sock; - unsigned long s; - int rv, err = 0; + int err; file = fcheck_files(files, fd); if (!file) continue; - path = d_path(&file->f_path, tmp, PAGE_SIZE); - rv = sscanf(path, "socket:[%lu]", &s); - if (rv <= 0) - continue; - sock = sock_from_file(file, &err); - if (!err) + if (sock) sock_update_netprioidx(sock->sk, p); } - rcu_read_unlock(); + spin_unlock(&files->file_lock); task_unlock(p); } - kfree(tmp); } static struct cftype ss_files[] = { -- cgit From 48a87cc26c13b68f6cce4e9d769fcb17a6b3e4b8 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 14 Aug 2012 12:34:30 +0000 Subject: net: netprio: fd passed in SCM_RIGHTS datagram not set correctly A socket fd passed in a SCM_RIGHTS datagram was not getting updated with the new tasks cgrp prioidx. This leaves IO on the socket tagged with the old tasks priority. To fix this add a check in the scm recvmsg path to update the sock cgrp prioidx with the new tasks value. Thanks to Al Viro for catching this. CC: Neil Horman Reported-by: Al Viro Signed-off-by: John Fastabend Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/core/scm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/scm.c b/net/core/scm.c index 8f6ccfd68ef4..040cebeed45b 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -265,6 +265,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); isk, current); fd_install(new_fd, fp[i]); } -- cgit From 476ad154f3b41dd7d9a08a2f641e28388abc2fd1 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 14 Aug 2012 12:34:35 +0000 Subject: net: netprio: fix cgrp create and write priomap race A race exists where creating cgroups and also updating the priomap may result in losing a priomap update. This is because priomap writers are not protected by rtnl_lock. Move priority writer into rtnl_lock()/rtnl_unlock(). CC: Neil Horman Reported-by: Al Viro Signed-off-by: John Fastabend Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/core/netprio_cgroup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index f65dba3afd99..c75e3f9d060f 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -101,12 +101,10 @@ static int write_update_netdev_table(struct net_device *dev) u32 max_len; struct netprio_map *map; - rtnl_lock(); max_len = atomic_read(&max_prioidx) + 1; map = rtnl_dereference(dev->priomap); if (!map || map->priomap_len < max_len) ret = extend_netdev_table(dev, max_len); - rtnl_unlock(); return ret; } @@ -256,17 +254,17 @@ static int write_priomap(struct cgroup *cgrp, struct cftype *cft, if (!dev) goto out_free_devname; + rtnl_lock(); ret = write_update_netdev_table(dev); if (ret < 0) goto out_put_dev; - rcu_read_lock(); - map = rcu_dereference(dev->priomap); + map = rtnl_dereference(dev->priomap); if (map) map->priomap[prioidx] = priority; - rcu_read_unlock(); out_put_dev: + rtnl_unlock(); dev_put(dev); out_free_devname: -- cgit From c0de08d04215031d68fa13af36f347a6cfa252ca Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Thu, 16 Aug 2012 22:02:58 +0000 Subject: af_packet: don't emit packet on orig fanout group If a packet is emitted on one socket in one group of fanout sockets, it is transmitted again. It is thus read again on one of the sockets of the fanout group. This result in a loop for software which generate packets when receiving one. This retransmission is not the intended behavior: a fanout group must behave like a single socket. The packet should not be transmitted on a socket if it originates from a socket belonging to the same fanout group. This patch fixes the issue by changing the transmission check to take fanout group info account. Reported-by: Aleksandr Kotov Signed-off-by: Eric Leblond Signed-off-by: David S. Miller --- net/core/dev.c | 16 ++++++++++++++-- net/packet/af_packet.c | 9 +++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index a39354ee1432..debd9372472f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1642,6 +1642,19 @@ static inline int deliver_skb(struct sk_buff *skb, return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } +static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) +{ + if (ptype->af_packet_priv == NULL) + return false; + + if (ptype->id_match) + return ptype->id_match(ptype, skb->sk); + else if ((struct sock *)ptype->af_packet_priv == skb->sk) + return true; + + return false; +} + /* * Support routine. Sends outgoing frames to any network * taps currently in use. @@ -1659,8 +1672,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) * they originated from - MvS (miquels@drinkel.ow.org) */ if ((ptype->dev == dev || !ptype->dev) && - (ptype->af_packet_priv == NULL || - (struct sock *)ptype->af_packet_priv != skb->sk)) { + (!skb_loop_sk(ptype, skb))) { if (pt_prev) { deliver_skb(skb2, pt_prev, skb->dev); pt_prev = ptype; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8ac890a1a4c0..aee7196aac36 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1273,6 +1273,14 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po) spin_unlock(&f->lock); } +bool match_fanout_group(struct packet_type *ptype, struct sock * sk) +{ + if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout) + return true; + + return false; +} + static int fanout_add(struct sock *sk, u16 id, u16 type_flags) { struct packet_sock *po = pkt_sk(sk); @@ -1325,6 +1333,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; + match->prot_hook.id_match = match_fanout_group; dev_add_pack(&match->prot_hook); list_add(&match->list, &fanout_list); } -- cgit From d92c7f8aabae913de16eb855b19cd2002c341896 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Fri, 17 Aug 2012 10:33:12 +0000 Subject: caif: Do not dereference NULL in chnl_recv_cb() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In net/caif/chnl_net.c::chnl_recv_cb() we call skb_header_pointer() which may return NULL, but we do not check for a NULL pointer before dereferencing it. This patch adds such a NULL check and properly free's allocated memory and return an error (-EINVAL) on failure - much better than crashing.. Signed-off-by: Jesper Juhl Acked-by: Sjur Brændeland Signed-off-by: David S. Miller --- net/caif/chnl_net.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 69771c04ba8f..e597733affb8 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -94,6 +94,10 @@ static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) /* check the version of IP */ ip_version = skb_header_pointer(skb, 0, 1, &buf); + if (!ip_version) { + kfree_skb(skb); + return -EINVAL; + } switch (*ip_version >> 4) { case 4: -- cgit From 9d7b0fc1ef1f17aff57c0dc9a59453d8fca255c3 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 20 Aug 2012 02:56:56 -0700 Subject: net: ipv6: fix oops in inet_putpeer() Commit 97bab73f (inet: Hide route peer accesses behind helpers.) introduced a bug in xfrm6_policy_destroy(). The xfrm_dst's _rt6i_peer member is not initialized, causing a false positive result from inetpeer_ptr_is_peer(), which in turn causes a NULL pointer dereference in inet_putpeer(). Pid: 314, comm: kworker/0:1 Not tainted 3.6.0-rc1+ #17 To Be Filled By O.E.M. To Be Filled By O.E.M./P4S800D-X EIP: 0060:[] EFLAGS: 00010246 CPU: 0 EIP is at inet_putpeer+0xe/0x16 EAX: 00000000 EBX: f3481700 ECX: 00000000 EDX: 000dd641 ESI: f3481700 EDI: c05e949c EBP: f551def4 ESP: f551def4 DS: 007b ES: 007b FS: 0000 GS: 00e0 SS: 0068 CR0: 8005003b CR2: 00000070 CR3: 3243d000 CR4: 00000750 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 f551df04 c0423de1 00000000 f3481700 f551df18 c038d5f7 f254b9f8 f551df28 f34f85d8 f551df20 c03ef48d f551df3c c0396870 f30697e8 f24e1738 c05e98f4 f5509540 c05cd2b4 f551df7c c0142d2b c043feb5 f5509540 00000000 c05cd2e8 [] xfrm6_dst_destroy+0x42/0xdb [] dst_destroy+0x1d/0xa4 [] xfrm_bundle_flo_delete+0x2b/0x36 [] flow_cache_gc_task+0x85/0x9f [] process_one_work+0x122/0x441 [] ? apic_timer_interrupt+0x31/0x38 [] ? flow_cache_new_hashrnd+0x2b/0x2b [] worker_thread+0x113/0x3cc Fix by adding a init_dst() callback to struct xfrm_policy_afinfo to properly initialize the dst's peer pointer. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv6/xfrm6_policy.c | 8 ++++++++ net/xfrm/xfrm_policy.c | 2 ++ 2 files changed, 10 insertions(+) (limited to 'net') diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index ef39812107b1..f8c4c08ffb60 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -73,6 +73,13 @@ static int xfrm6_get_tos(const struct flowi *fl) return 0; } +static void xfrm6_init_dst(struct net *net, struct xfrm_dst *xdst) +{ + struct rt6_info *rt = (struct rt6_info *)xdst; + + rt6_init_peer(rt, net->ipv6.peers); +} + static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { @@ -286,6 +293,7 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .get_saddr = xfrm6_get_saddr, .decode_session = _decode_session6, .get_tos = xfrm6_get_tos, + .init_dst = xfrm6_init_dst, .init_path = xfrm6_init_path, .fill_dst = xfrm6_fill_dst, .blackhole_route = ip6_blackhole_route, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c5a5165a5927..5a2aa17e4d3c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1357,6 +1357,8 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst)); xdst->flo.ops = &xfrm_bundle_fc_ops; + if (afinfo->init_dst) + afinfo->init_dst(net, xdst); } else xdst = ERR_PTR(-ENOBUFS); -- cgit From 3de7a37b02f607716753dc669c1dca4f71db08fc Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Aug 2012 14:36:44 +0000 Subject: net/core/dev.c: fix kernel-doc warning Fix kernel-doc warning: Warning(net/core/dev.c:5745): No description found for parameter 'dev' Signed-off-by: Randy Dunlap Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index debd9372472f..83988362805e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5744,6 +5744,7 @@ EXPORT_SYMBOL(netdev_refcnt_read); /** * netdev_wait_allrefs - wait until all references are gone. + * @dev: target net_device * * This is called when unregistering network devices. * -- cgit From fae6ef87faeb8853896920c68ee703d715799d28 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 19 Aug 2012 03:30:38 +0000 Subject: net: tcp: move sk_rx_dst_set call after tcp_create_openreq_child() This commit removes the sk_rx_dst_set calls from tcp_create_openreq_child(), because at that point the icsk_af_ops field of ipv6_mapped TCP sockets has not been set to its proper final value. Instead, to make sure we get the right sk_rx_dst_set variant appropriate for the address family of the new connection, we have tcp_v{4,6}_syn_recv_sock() directly call the appropriate function shortly after the call to tcp_create_openreq_child() returns. This also moves inet6_sk_rx_dst_set() to avoid a forward declaration with the new approach. Signed-off-by: Neal Cardwell Reported-by: Artem Savkov Cc: Eric Dumazet Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 2 -- net/ipv6/tcp_ipv6.c | 25 +++++++++++++------------ 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 767823764016..5bf2040b25b1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1462,6 +1462,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, goto exit_nonewsk; newsk->sk_gso_type = SKB_GSO_TCPV4; + inet_sk_rx_dst_set(newsk, skb); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index d9c9dcef2de3..6ff7f10dce9d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -387,8 +387,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_sock *oldtp = tcp_sk(sk); struct tcp_cookie_values *oldcvp = oldtp->cookie_values; - newicsk->icsk_af_ops->sk_rx_dst_set(newsk, skb); - /* TCP Cookie Transactions require space for the cookie pair, * as it differs for each connection. There is no need to * copy any s_data_payload stored at the original socket. diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index bb9ce2b2f377..a3e60cc04a8a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -94,6 +94,18 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, } #endif +static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct rt6_info *rt = (const struct rt6_info *)dst; + + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + if (rt->rt6i_node) + inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; +} + static void tcp_v6_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { @@ -1270,6 +1282,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->sk_gso_type = SKB_GSO_TCPV6; __ip6_dst_store(newsk, dst, NULL, NULL); + inet6_sk_rx_dst_set(newsk, skb); newtcp6sk = (struct tcp6_sock *)newsk; inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; @@ -1729,18 +1742,6 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_destructor= tcp_twsk_destructor, }; -static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - const struct rt6_info *rt = (const struct rt6_info *)dst; - - dst_hold(dst); - sk->sk_rx_dst = dst; - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - if (rt->rt6i_node) - inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; -} - static const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, -- cgit From d1c338a509cea5378df59629ad47382810c38623 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 19 Aug 2012 12:29:16 -0700 Subject: libceph: delay debugfs initialization until we learn global_id The debugfs directory includes the cluster fsid and our unique global_id. We need to delay the initialization of the debug entry until we have learned both the fsid and our global_id from the monitor or else the second client can't create its debugfs entry and will fail (and multiple client instances aren't properly reflected in debugfs). Reported by: Yan, Zheng Signed-off-by: Sage Weil Reviewed-by: Yehuda Sadeh --- net/ceph/ceph_common.c | 1 - net/ceph/debugfs.c | 4 ++++ net/ceph/mon_client.c | 51 +++++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 50 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 69e38db28e5f..a8020293f342 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -84,7 +84,6 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) return -1; } } else { - pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid); memcpy(&client->fsid, fsid, sizeof(*fsid)); } return 0; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 54b531a01121..38b5dc1823d4 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -189,6 +189,9 @@ int ceph_debugfs_client_init(struct ceph_client *client) snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid, client->monc.auth->global_id); + dout("ceph_debugfs_client_init %p %s\n", client, name); + + BUG_ON(client->debugfs_dir); client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); if (!client->debugfs_dir) goto out; @@ -234,6 +237,7 @@ out: void ceph_debugfs_client_cleanup(struct ceph_client *client) { + dout("ceph_debugfs_client_cleanup %p\n", client); debugfs_remove(client->debugfs_osdmap); debugfs_remove(client->debugfs_monmap); debugfs_remove(client->osdc.debugfs_file); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 105d533b55f3..900ea0f043fc 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -310,6 +310,17 @@ int ceph_monc_open_session(struct ceph_mon_client *monc) } EXPORT_SYMBOL(ceph_monc_open_session); +/* + * We require the fsid and global_id in order to initialize our + * debugfs dir. + */ +static bool have_debugfs_info(struct ceph_mon_client *monc) +{ + dout("have_debugfs_info fsid %d globalid %lld\n", + (int)monc->client->have_fsid, monc->auth->global_id); + return monc->client->have_fsid && monc->auth->global_id > 0; +} + /* * The monitor responds with mount ack indicate mount success. The * included client ticket allows the client to talk to MDSs and OSDs. @@ -320,9 +331,12 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_client *client = monc->client; struct ceph_monmap *monmap = NULL, *old = monc->monmap; void *p, *end; + int had_debugfs_info, init_debugfs = 0; mutex_lock(&monc->mutex); + had_debugfs_info = have_debugfs_info(monc); + dout("handle_monmap\n"); p = msg->front.iov_base; end = p + msg->front.iov_len; @@ -344,12 +358,22 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, if (!client->have_fsid) { client->have_fsid = true; + if (!had_debugfs_info && have_debugfs_info(monc)) { + pr_info("client%lld fsid %pU\n", + ceph_client_id(monc->client), + &monc->client->fsid); + init_debugfs = 1; + } mutex_unlock(&monc->mutex); - /* - * do debugfs initialization without mutex to avoid - * creating a locking dependency - */ - ceph_debugfs_client_init(client); + + if (init_debugfs) { + /* + * do debugfs initialization without mutex to avoid + * creating a locking dependency + */ + ceph_debugfs_client_init(monc->client); + } + goto out_unlocked; } out: @@ -865,8 +889,10 @@ static void handle_auth_reply(struct ceph_mon_client *monc, { int ret; int was_auth = 0; + int had_debugfs_info, init_debugfs = 0; mutex_lock(&monc->mutex); + had_debugfs_info = have_debugfs_info(monc); if (monc->auth->ops) was_auth = monc->auth->ops->is_authenticated(monc->auth); monc->pending_auth = 0; @@ -889,7 +915,22 @@ static void handle_auth_reply(struct ceph_mon_client *monc, __send_subscribe(monc); __resend_generic_request(monc); } + + if (!had_debugfs_info && have_debugfs_info(monc)) { + pr_info("client%lld fsid %pU\n", + ceph_client_id(monc->client), + &monc->client->fsid); + init_debugfs = 1; + } mutex_unlock(&monc->mutex); + + if (init_debugfs) { + /* + * do debugfs initialization without mutex to avoid + * creating a locking dependency + */ + ceph_debugfs_client_init(monc->client); + } } static int __validate_auth(struct ceph_mon_client *monc) -- cgit From 144d56e91044181ec0ef67aeca91e9a8b5718348 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 20 Aug 2012 00:22:46 +0000 Subject: tcp: fix possible socket refcount problem Commit 6f458dfb40 (tcp: improve latencies of timer triggered events) added bug leading to following trace : [ 2866.131281] IPv4: Attempt to release TCP socket in state 1 ffff880019ec0000 [ 2866.131726] [ 2866.132188] ========================= [ 2866.132281] [ BUG: held lock freed! ] [ 2866.132281] 3.6.0-rc1+ #622 Not tainted [ 2866.132281] ------------------------- [ 2866.132281] kworker/0:1/652 is freeing memory ffff880019ec0000-ffff880019ec0a1f, with a lock still held there! [ 2866.132281] (sk_lock-AF_INET-RPC){+.+...}, at: [] tcp_sendmsg+0x29/0xcc6 [ 2866.132281] 4 locks held by kworker/0:1/652: [ 2866.132281] #0: (rpciod){.+.+.+}, at: [] process_one_work+0x1de/0x47f [ 2866.132281] #1: ((&task->u.tk_work)){+.+.+.}, at: [] process_one_work+0x1de/0x47f [ 2866.132281] #2: (sk_lock-AF_INET-RPC){+.+...}, at: [] tcp_sendmsg+0x29/0xcc6 [ 2866.132281] #3: (&icsk->icsk_retransmit_timer){+.-...}, at: [] run_timer_softirq+0x1ad/0x35f [ 2866.132281] [ 2866.132281] stack backtrace: [ 2866.132281] Pid: 652, comm: kworker/0:1 Not tainted 3.6.0-rc1+ #622 [ 2866.132281] Call Trace: [ 2866.132281] [] debug_check_no_locks_freed+0x112/0x159 [ 2866.132281] [] ? __sk_free+0xfd/0x114 [ 2866.132281] [] kmem_cache_free+0x6b/0x13a [ 2866.132281] [] __sk_free+0xfd/0x114 [ 2866.132281] [] sk_free+0x1c/0x1e [ 2866.132281] [] tcp_write_timer+0x51/0x56 [ 2866.132281] [] run_timer_softirq+0x218/0x35f [ 2866.132281] [] ? run_timer_softirq+0x1ad/0x35f [ 2866.132281] [] ? rb_commit+0x58/0x85 [ 2866.132281] [] ? tcp_write_timer_handler+0x148/0x148 [ 2866.132281] [] __do_softirq+0xcb/0x1f9 [ 2866.132281] [] ? _raw_spin_unlock+0x29/0x2e [ 2866.132281] [] call_softirq+0x1c/0x30 [ 2866.132281] [] do_softirq+0x4a/0xa6 [ 2866.132281] [] irq_exit+0x51/0xad [ 2866.132281] [] do_IRQ+0x9d/0xb4 [ 2866.132281] [] common_interrupt+0x6f/0x6f [ 2866.132281] [] ? sched_clock_cpu+0x58/0xd1 [ 2866.132281] [] ? _raw_spin_unlock_irqrestore+0x4c/0x56 [ 2866.132281] [] mod_timer+0x178/0x1a9 [ 2866.132281] [] sk_reset_timer+0x19/0x26 [ 2866.132281] [] tcp_rearm_rto+0x99/0xa4 [ 2866.132281] [] tcp_event_new_data_sent+0x6e/0x70 [ 2866.132281] [] tcp_write_xmit+0x7de/0x8e4 [ 2866.132281] [] ? __alloc_skb+0xa0/0x1a1 [ 2866.132281] [] __tcp_push_pending_frames+0x2e/0x8a [ 2866.132281] [] tcp_sendmsg+0xb32/0xcc6 [ 2866.132281] [] inet_sendmsg+0xaa/0xd5 [ 2866.132281] [] ? inet_autobind+0x5f/0x5f [ 2866.132281] [] ? trace_clock_local+0x9/0xb [ 2866.132281] [] sock_sendmsg+0xa3/0xc4 [ 2866.132281] [] ? rb_reserve_next_event+0x26f/0x2d5 [ 2866.132281] [] ? native_sched_clock+0x29/0x6f [ 2866.132281] [] ? sched_clock+0x9/0xd [ 2866.132281] [] ? trace_clock_local+0x9/0xb [ 2866.132281] [] kernel_sendmsg+0x37/0x43 [ 2866.132281] [] xs_send_kvec+0x77/0x80 [ 2866.132281] [] xs_sendpages+0x6f/0x1a0 [ 2866.132281] [] ? try_to_del_timer_sync+0x55/0x61 [ 2866.132281] [] xs_tcp_send_request+0x55/0xf1 [ 2866.132281] [] xprt_transmit+0x89/0x1db [ 2866.132281] [] ? call_connect+0x3c/0x3c [ 2866.132281] [] call_transmit+0x1c5/0x20e [ 2866.132281] [] __rpc_execute+0x6f/0x225 [ 2866.132281] [] ? call_connect+0x3c/0x3c [ 2866.132281] [] rpc_async_schedule+0x28/0x34 [ 2866.132281] [] process_one_work+0x24d/0x47f [ 2866.132281] [] ? process_one_work+0x1de/0x47f [ 2866.132281] [] ? __rpc_execute+0x225/0x225 [ 2866.132281] [] worker_thread+0x236/0x317 [ 2866.132281] [] ? process_scheduled_works+0x2f/0x2f [ 2866.132281] [] kthread+0x9a/0xa2 [ 2866.132281] [] kernel_thread_helper+0x4/0x10 [ 2866.132281] [] ? retint_restore_args+0x13/0x13 [ 2866.132281] [] ? __init_kthread_worker+0x5a/0x5a [ 2866.132281] [] ? gs_change+0x13/0x13 [ 2866.308506] IPv4: Attempt to release TCP socket in state 1 ffff880019ec0000 [ 2866.309689] ============================================================================= [ 2866.310254] BUG TCP (Not tainted): Object already free [ 2866.310254] ----------------------------------------------------------------------------- [ 2866.310254] The bug comes from the fact that timer set in sk_reset_timer() can run before we actually do the sock_hold(). socket refcount reaches zero and we free the socket too soon. timer handler is not allowed to reduce socket refcnt if socket is owned by the user, or we need to change sk_reset_timer() implementation. We should take a reference on the socket in case TCP_DELACK_TIMER_DEFERRED or TCP_DELACK_TIMER_DEFERRED bit are set in tsq_flags Also fix a typo in tcp_delack_timer(), where TCP_WRITE_TIMER_DEFERRED was used instead of TCP_DELACK_TIMER_DEFERRED. For consistency, use same socket refcount change for TCP_MTU_REDUCED_DEFERRED, even if not fired from a timer. Reported-by: Fengguang Wu Tested-by: Fengguang Wu Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 8 +++++--- net/ipv4/tcp_output.c | 14 +++++++++----- net/ipv4/tcp_timer.c | 6 ++++-- 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5bf2040b25b1..00a748d14062 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -417,10 +417,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ tp->mtu_info = info; - if (!sock_owned_by_user(sk)) + if (!sock_owned_by_user(sk)) { tcp_v4_mtu_reduced(sk); - else - set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); + } else { + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) + sock_hold(sk); + } goto out; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 20dfd892c86f..d04632673a9e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -910,14 +910,18 @@ void tcp_release_cb(struct sock *sk) if (flags & (1UL << TCP_TSQ_DEFERRED)) tcp_tsq_handler(sk); - if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) + if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { tcp_write_timer_handler(sk); - - if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) + __sock_put(sk); + } + if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) { tcp_delack_timer_handler(sk); - - if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) + __sock_put(sk); + } + if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { sk->sk_prot->mtu_reduced(sk); + __sock_put(sk); + } } EXPORT_SYMBOL(tcp_release_cb); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 6df36ad55a38..b774a03bd1dc 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -252,7 +252,8 @@ static void tcp_delack_timer(unsigned long data) inet_csk(sk)->icsk_ack.blocked = 1; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ - set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags); + if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) + sock_hold(sk); } bh_unlock_sock(sk); sock_put(sk); @@ -481,7 +482,8 @@ static void tcp_write_timer(unsigned long data) tcp_write_timer_handler(sk); } else { /* deleguate our work to tcp_release_cb() */ - set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags); + if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) + sock_hold(sk); } bh_unlock_sock(sk); sock_put(sk); -- cgit From 1a7b27c97ce675b42eeb7bfaf6e15c34f35c8f95 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Mon, 20 Aug 2012 02:52:09 +0000 Subject: ipv4: Use newinet->inet_opt in inet_csk_route_child_sock() Since 0e734419923bd ("ipv4: Use inet_csk_route_child_sock() in DCCP and TCP."), inet_csk_route_child_sock() is called instead of inet_csk_route_req(). However, after creating the child-sock in tcp/dccp_v4_syn_recv_sock(), ireq->opt is set to NULL, before calling inet_csk_route_child_sock(). Thus, inside inet_csk_route_child_sock() opt is always NULL and the SRR-options are not respected anymore. Packets sent by the server won't have the correct destination-IP. This patch fixes it by accessing newinet->inet_opt instead of ireq->opt inside inet_csk_route_child_sock(). Reported-by: Luca Boccassi Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index db0cf17c00f7..7f75f21d7b83 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -404,12 +404,15 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, { const struct inet_request_sock *ireq = inet_rsk(req); struct inet_sock *newinet = inet_sk(newsk); - struct ip_options_rcu *opt = ireq->opt; + struct ip_options_rcu *opt; struct net *net = sock_net(sk); struct flowi4 *fl4; struct rtable *rt; fl4 = &newinet->cork.fl.u.ip4; + + rcu_read_lock(); + opt = rcu_dereference(newinet->inet_opt); flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), @@ -421,11 +424,13 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_gateway) goto route_err; + rcu_read_unlock(); return &rt->dst; route_err: ip_rt_put(rt); no_route: + rcu_read_unlock(); IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } -- cgit From a9915a1b52df52ad87f3b33422da95cf25372f09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 20 Aug 2012 07:26:45 +0000 Subject: ipv4: fix ip header ident selection in __ip_make_skb() Christian Casteyde reported a kmemcheck 32-bit read from uninitialized memory in __ip_select_ident(). It turns out that __ip_make_skb() called ip_select_ident() before properly initializing iph->daddr. This is a bug uncovered by commit 1d861aa4b3fb (inet: Minimize use of cached route inetpeer.) Addresses https://bugzilla.kernel.org/show_bug.cgi?id=46131 Reported-by: Christian Casteyde Signed-off-by: Eric Dumazet Cc: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 147ccc3e93db..c196d749daf2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1338,10 +1338,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk, iph->ihl = 5; iph->tos = inet->tos; iph->frag_off = df; - ip_select_ident(iph, &rt->dst, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); + ip_select_ident(iph, &rt->dst, sk); if (opt) { iph->ihl += opt->optlen>>2; -- cgit From e0e3cea46d31d23dc40df0a49a7a2c04fe8edfea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Aug 2012 06:21:17 +0000 Subject: af_netlink: force credentials passing [CVE-2012-3520] Pablo Neira Ayuso discovered that avahi and potentially NetworkManager accept spoofed Netlink messages because of a kernel bug. The kernel passes all-zero SCM_CREDENTIALS ancillary data to the receiver if the sender did not provide such data, instead of not including any such data at all or including the correct data from the peer (as it is the case with AF_UNIX). This bug was introduced in commit 16e572626961 (af_unix: dont send SCM_CREDENTIALS by default) This patch forces passing credentials for netlink, as before the regression. Another fix would be to not add SCM_CREDENTIALS in netlink messages if not provided by the sender, but it might break some programs. With help from Florian Weimer & Petr Matousek This issue is designated as CVE-2012-3520 Signed-off-by: Eric Dumazet Cc: Petr Matousek Cc: Florian Weimer Cc: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 2 +- net/unix/af_unix.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5463969da45b..1445d73533ed 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1362,7 +1362,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &scm; - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, true); if (err < 0) return err; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e4768c180da2..c5ee4ff61364 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1450,7 +1450,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; wait_for_unix_gc(); - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, false); if (err < 0) return err; @@ -1619,7 +1619,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; wait_for_unix_gc(); - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, false); if (err < 0) return err; -- cgit From 6d4221b53707486dfad3f5bfe568d2ce7f4c9863 Mon Sep 17 00:00:00 2001 From: Jim Schutt Date: Fri, 10 Aug 2012 10:37:38 -0700 Subject: libceph: avoid truncation due to racing banners Because the Ceph client messenger uses a non-blocking connect, it is possible for the sending of the client banner to race with the arrival of the banner sent by the peer. When ceph_sock_state_change() notices the connect has completed, it schedules work to process the socket via con_work(). During this time the peer is writing its banner, and arrival of the peer banner races with con_work(). If con_work() calls try_read() before the peer banner arrives, there is nothing for it to do, after which con_work() calls try_write() to send the client's banner. In this case Ceph's protocol negotiation can complete succesfully. The server-side messenger immediately sends its banner and addresses after accepting a connect request, *before* actually attempting to read or verify the banner from the client. As a result, it is possible for the banner from the server to arrive before con_work() calls try_read(). If that happens, try_read() will read the banner and prepare protocol negotiation info via prepare_write_connect(). prepare_write_connect() calls con_out_kvec_reset(), which discards the as-yet-unsent client banner. Next, con_work() calls try_write(), which sends the protocol negotiation info rather than the banner that the peer is expecting. The result is that the peer sees an invalid banner, and the client reports "negotiation failed". Fix this by moving con_out_kvec_reset() out of prepare_write_connect() to its callers at all locations except the one where the banner might still need to be sent. [elder@inktak.com: added note about server-side behavior] Signed-off-by: Jim Schutt Reviewed-by: Alex Elder --- net/ceph/messenger.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b9796750034a..24c5eea8c45b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -915,7 +915,6 @@ static int prepare_write_connect(struct ceph_connection *con) con->out_connect.authorizer_len = auth ? cpu_to_le32(auth->authorizer_buf_len) : 0; - con_out_kvec_reset(con); con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect); if (auth && auth->authorizer_buf_len) @@ -1557,6 +1556,7 @@ static int process_connect(struct ceph_connection *con) return -1; } con->auth_retry = 1; + con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1577,6 +1577,7 @@ static int process_connect(struct ceph_connection *con) ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr)); reset_connection(con); + con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1601,6 +1602,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->out_connect.connect_seq), le32_to_cpu(con->in_reply.connect_seq)); con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); + con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1617,6 +1619,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_reply.global_seq)); get_global_seq(con->msgr, le32_to_cpu(con->in_reply.global_seq)); + con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -2135,7 +2138,11 @@ more: BUG_ON(con->state != CON_STATE_CONNECTING); con->state = CON_STATE_NEGOTIATING; - /* Banner is good, exchange connection info */ + /* + * Received banner is good, exchange connection info. + * Do not reset out_kvec, as sending our banner raced + * with receiving peer banner after connect completed. + */ ret = prepare_write_connect(con); if (ret < 0) goto out; -- cgit