From c7262e711ae6e466baeb9ddc21d678c878469b1f Mon Sep 17 00:00:00 2001
From: Johan Hedberg <johan.hedberg@intel.com>
Date: Tue, 17 Jun 2014 13:07:37 +0300
Subject: Bluetooth: Fix overriding higher security level in SMP

When we receive a pairing request or an internal request to start
pairing we shouldn't blindly overwrite the existing pending_sec_level
value as that may actually be higher than the new one. This patch fixes
the SMP code to only overwrite the value in case the new one is higher
than the old.

Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/smp.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index f2829a7932e2..0189ec8b68d1 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -669,7 +669,7 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
 {
 	struct smp_cmd_pairing rsp, *req = (void *) skb->data;
 	struct smp_chan *smp;
-	u8 key_size, auth;
+	u8 key_size, auth, sec_level;
 	int ret;
 
 	BT_DBG("conn %p", conn);
@@ -695,7 +695,9 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
 	/* We didn't start the pairing, so match remote */
 	auth = req->auth_req;
 
-	conn->hcon->pending_sec_level = authreq_to_seclevel(auth);
+	sec_level = authreq_to_seclevel(auth);
+	if (sec_level > conn->hcon->pending_sec_level)
+		conn->hcon->pending_sec_level = sec_level;
 
 	build_pairing_cmd(conn, req, &rsp, auth);
 
@@ -838,6 +840,7 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb)
 	struct smp_cmd_pairing cp;
 	struct hci_conn *hcon = conn->hcon;
 	struct smp_chan *smp;
+	u8 sec_level;
 
 	BT_DBG("conn %p", conn);
 
@@ -847,7 +850,9 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb)
 	if (!(conn->hcon->link_mode & HCI_LM_MASTER))
 		return SMP_CMD_NOTSUPP;
 
-	hcon->pending_sec_level = authreq_to_seclevel(rp->auth_req);
+	sec_level = authreq_to_seclevel(rp->auth_req);
+	if (sec_level > hcon->pending_sec_level)
+		hcon->pending_sec_level = sec_level;
 
 	if (smp_ltk_encrypt(conn, hcon->pending_sec_level))
 		return 0;
@@ -901,9 +906,12 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
 	if (smp_sufficient_security(hcon, sec_level))
 		return 1;
 
+	if (sec_level > hcon->pending_sec_level)
+		hcon->pending_sec_level = sec_level;
+
 	if (hcon->link_mode & HCI_LM_MASTER)
-		if (smp_ltk_encrypt(conn, sec_level))
-			goto done;
+		if (smp_ltk_encrypt(conn, hcon->pending_sec_level))
+			return 0;
 
 	if (test_and_set_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags))
 		return 0;
@@ -918,7 +926,7 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
 	 * requires it.
 	 */
 	if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT ||
-	    sec_level > BT_SECURITY_MEDIUM)
+	    hcon->pending_sec_level > BT_SECURITY_MEDIUM)
 		authreq |= SMP_AUTH_MITM;
 
 	if (hcon->link_mode & HCI_LM_MASTER) {
@@ -937,9 +945,6 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
 
 	set_bit(SMP_FLAG_INITIATOR, &smp->flags);
 
-done:
-	hcon->pending_sec_level = sec_level;
-
 	return 0;
 }
 
-- 
cgit 


From 581370cc74ea75426421a1f5851ef05e2e995b01 Mon Sep 17 00:00:00 2001
From: Johan Hedberg <johan.hedberg@intel.com>
Date: Tue, 17 Jun 2014 13:07:38 +0300
Subject: Bluetooth: Refactor authentication method lookup into its own
 function

We'll need to do authentication method lookups from more than one place,
so refactor the lookup into its own function.

Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/smp.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 0189ec8b68d1..7156f4720644 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -385,6 +385,16 @@ static const u8 gen_method[5][5] = {
 	{ CFM_PASSKEY, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, OVERLAP     },
 };
 
+static u8 get_auth_method(struct smp_chan *smp, u8 local_io, u8 remote_io)
+{
+	/* If either side has unknown io_caps, use JUST WORKS */
+	if (local_io > SMP_IO_KEYBOARD_DISPLAY ||
+	    remote_io > SMP_IO_KEYBOARD_DISPLAY)
+		return JUST_WORKS;
+
+	return gen_method[remote_io][local_io];
+}
+
 static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth,
 						u8 local_io, u8 remote_io)
 {
@@ -401,14 +411,11 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth,
 	BT_DBG("tk_request: auth:%d lcl:%d rem:%d", auth, local_io, remote_io);
 
 	/* If neither side wants MITM, use JUST WORKS */
-	/* If either side has unknown io_caps, use JUST WORKS */
 	/* Otherwise, look up method from the table */
-	if (!(auth & SMP_AUTH_MITM) ||
-	    local_io > SMP_IO_KEYBOARD_DISPLAY ||
-	    remote_io > SMP_IO_KEYBOARD_DISPLAY)
+	if (!(auth & SMP_AUTH_MITM))
 		method = JUST_WORKS;
 	else
-		method = gen_method[remote_io][local_io];
+		method = get_auth_method(smp, local_io, remote_io);
 
 	/* If not bonding, don't ask user to confirm a Zero TK */
 	if (!(auth & SMP_AUTH_BONDING) && method == JUST_CFM)
-- 
cgit 


From 2ed8f65ca262bca778e60053f667ce11b32db6b8 Mon Sep 17 00:00:00 2001
From: Johan Hedberg <johan.hedberg@intel.com>
Date: Tue, 17 Jun 2014 13:07:39 +0300
Subject: Bluetooth: Fix rejecting pairing in case of insufficient capabilities

If we need an MITM protected connection but the local and remote IO
capabilities cannot provide it we should reject the pairing attempt in
the appropriate way. This patch adds the missing checks for such a
situation to the smp_cmd_pairing_req() and smp_cmd_pairing_rsp()
functions.

Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/smp.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'net')

diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 7156f4720644..e33a982161c1 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -706,6 +706,16 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
 	if (sec_level > conn->hcon->pending_sec_level)
 		conn->hcon->pending_sec_level = sec_level;
 
+	/* If we need MITM check that it can be acheived */
+	if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) {
+		u8 method;
+
+		method = get_auth_method(smp, conn->hcon->io_capability,
+					 req->io_capability);
+		if (method == JUST_WORKS || method == JUST_CFM)
+			return SMP_AUTH_REQUIREMENTS;
+	}
+
 	build_pairing_cmd(conn, req, &rsp, auth);
 
 	key_size = min(req->max_key_size, rsp.max_key_size);
@@ -752,6 +762,16 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb)
 	if (check_enc_key_size(conn, key_size))
 		return SMP_ENC_KEY_SIZE;
 
+	/* If we need MITM check that it can be acheived */
+	if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) {
+		u8 method;
+
+		method = get_auth_method(smp, req->io_capability,
+					 rsp->io_capability);
+		if (method == JUST_WORKS || method == JUST_CFM)
+			return SMP_AUTH_REQUIREMENTS;
+	}
+
 	get_random_bytes(smp->prnd, sizeof(smp->prnd));
 
 	smp->prsp[0] = SMP_CMD_PAIRING_RSP;
-- 
cgit 


From 1d56dc4f5f7cdf0ba99062d974b7586a28fc5cf4 Mon Sep 17 00:00:00 2001
From: Lukasz Rymanowski <lukasz.rymanowski@tieto.com>
Date: Tue, 17 Jun 2014 13:04:20 +0200
Subject: Bluetooth: Fix for ACL disconnect when pairing fails

When pairing fails hci_conn refcnt drops below zero. This cause that
ACL link is not disconnected when disconnect timeout fires.

Probably this is because l2cap_conn_del calls l2cap_chan_del for each
channel, and inside l2cap_chan_del conn is dropped. After that loop
hci_chan_del is called which also drops conn.

Anyway, as it is desrcibed in hci_core.h, it is known that refcnt
drops below 0 sometimes and it should be fine. If so, let disconnect
link when hci_conn_timeout fires and refcnt is 0 or below. This patch
does it.

This affects PTS test SM_TC_JW_BV_05_C

Logs from scenario:

[69713.706227] [6515] pair_device:
[69713.706230] [6515] hci_conn_add: hci0 dst 00:1b:dc:06:06:22
[69713.706233] [6515] hci_dev_hold: hci0 orig refcnt 8
[69713.706235] [6515] hci_conn_init_sysfs: conn ffff88021f65a000
[69713.706239] [6515] hci_req_add_ev: hci0 opcode 0x200d plen 25
[69713.706242] [6515] hci_prepare_cmd: skb len 28
[69713.706243] [6515] hci_req_run: length 1
[69713.706248] [6515] hci_conn_hold: hcon ffff88021f65a000 orig refcnt 0
[69713.706251] [6515] hci_dev_put: hci0 orig refcnt 9
[69713.706281] [8909] hci_cmd_work: hci0 cmd_cnt 1 cmd queued 1
[69713.706288] [8909] hci_send_frame: hci0 type 1 len 28
[69713.706290] [8909] hci_send_to_monitor: hdev ffff88021f0c7000 len 28
[69713.706316] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000
[69713.706382] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000
[69713.711664] [8909] hci_rx_work: hci0
[69713.711668] [8909] hci_send_to_monitor: hdev ffff88021f0c7000 len 6
[69713.711680] [8909] hci_rx_work: hci0 Event packet
[69713.711683] [8909] hci_cs_le_create_conn: hci0 status 0x00
[69713.711685] [8909] hci_sent_cmd_data: hci0 opcode 0x200d
[69713.711688] [8909] hci_req_cmd_complete: opcode 0x200d status 0x00
[69713.711690] [8909] hci_sent_cmd_data: hci0 opcode 0x200d
[69713.711695] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000
[69713.711744] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000
[69713.818875] [8909] hci_rx_work: hci0
[69713.818889] [8909] hci_send_to_monitor: hdev ffff88021f0c7000 len 21
[69713.818913] [8909] hci_rx_work: hci0 Event packet
[69713.818917] [8909] hci_le_conn_complete_evt: hci0 status 0x00
[69713.818922] [8909] hci_send_to_control: len 19
[69713.818927] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000
[69713.818938] [8909] hci_conn_add_sysfs: conn ffff88021f65a000
[69713.818975] [6450] bt_sock_poll: sock ffff88005e758500, sk ffff88010323b800
[69713.818981] [6515] hci_sock_recvmsg: sock ffff88005e75a080, sk ffff88010323ac00
...
[69713.819021] [8909] hci_dev_hold: hci0 orig refcnt 10
[69713.819025] [8909] l2cap_connect_cfm: hcon ffff88021f65a000 bdaddr 00:1b:dc:06:06:22 status 0
[69713.819028] [8909] hci_chan_create: hci0 hcon ffff88021f65a000
[69713.819031] [8909] l2cap_conn_add: hcon ffff88021f65a000 conn ffff880221005c00 hchan ffff88020d60b1c0
[69713.819034] [8909] l2cap_conn_ready: conn ffff880221005c00
[69713.819036] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000
[69713.819037] [8909] smp_conn_security: conn ffff880221005c00 hcon ffff88021f65a000 level 0x02
[69713.819039] [8909] smp_chan_create:
[69713.819041] [8909] hci_conn_hold: hcon ffff88021f65a000 orig refcnt 1
[69713.819043] [8909] smp_send_cmd: code 0x01
[69713.819045] [8909] hci_send_acl: hci0 chan ffff88020d60b1c0 flags 0x0000
[69713.819046] [5949] hci_sock_recvmsg: sock ffff8800941a9900, sk ffff88012bf4e800
[69713.819049] [8909] hci_queue_acl: hci0 nonfrag skb ffff88005157c100 len 15
[69713.819055] [5949] hci_sock_recvmsg: sock ffff8800941a9900, sk ffff88012bf4e800
[69713.819057] [8909] l2cap_le_conn_ready:
[69713.819064] [8909] l2cap_chan_create: chan ffff88005ede2c00
[69713.819066] [8909] l2cap_chan_hold: chan ffff88005ede2c00 orig refcnt 1
[69713.819069] [8909] l2cap_sock_init: sk ffff88005ede5800
[69713.819072] [8909] bt_accept_enqueue: parent ffff880160356000, sk ffff88005ede5800
[69713.819074] [8909] __l2cap_chan_add: conn ffff880221005c00, psm 0x00, dcid 0x0004
[69713.819076] [8909] l2cap_chan_hold: chan ffff88005ede2c00 orig refcnt 2
[69713.819078] [8909] hci_conn_hold: hcon ffff88021f65a000 orig refcnt 2
[69713.819080] [8909] smp_conn_security: conn ffff880221005c00 hcon ffff88021f65a000 level 0x01
[69713.819082] [8909] l2cap_sock_ready_cb: sk ffff88005ede5800, parent ffff880160356000
[69713.819086] [8909] le_pairing_complete_cb: status 0
[69713.819091] [8909] hci_tx_work: hci0 acl 10 sco 8 le 0
[69713.819093] [8909] hci_sched_acl: hci0
[69713.819094] [8909] hci_sched_sco: hci0
[69713.819096] [8909] hci_sched_esco: hci0
[69713.819098] [8909] hci_sched_le: hci0
[69713.819099] [8909] hci_chan_sent: hci0
[69713.819101] [8909] hci_chan_sent: chan ffff88020d60b1c0 quote 10
[69713.819104] [8909] hci_sched_le: chan ffff88020d60b1c0 skb ffff88005157c100 len 15 priority 7
[69713.819106] [8909] hci_send_frame: hci0 type 2 len 15
[69713.819108] [8909] hci_send_to_monitor: hdev ffff88021f0c7000 len 15
[69713.819119] [8909] hci_chan_sent: hci0
[69713.819121] [8909] hci_prio_recalculate: hci0
[69713.819123] [8909] process_pending_rx:
[69713.819226] [6450] hci_sock_recvmsg: sock ffff88005e758780, sk ffff88010323d400
...
[69713.822022] [6450] l2cap_sock_accept: sk ffff880160356000 timeo 0
[69713.822024] [6450] bt_accept_dequeue: parent ffff880160356000
[69713.822026] [6450] bt_accept_unlink: sk ffff88005ede5800 state 1
[69713.822028] [6450] l2cap_sock_accept: new socket ffff88005ede5800
[69713.822368] [6450] l2cap_sock_getname: sock ffff8800941ab700, sk ffff88005ede5800
[69713.822375] [6450] l2cap_sock_getsockopt: sk ffff88005ede5800
[69713.822383] [6450] l2cap_sock_getname: sock ffff8800941ab700, sk ffff88005ede5800
[69713.822414] [6450] bt_sock_poll: sock ffff8800941ab700, sk ffff88005ede5800
...
[69713.823255] [6450] l2cap_sock_getname: sock ffff8800941ab700, sk ffff88005ede5800
[69713.823259] [6450] l2cap_sock_getsockopt: sk ffff88005ede5800
[69713.824322] [6450] l2cap_sock_getname: sock ffff8800941ab700, sk ffff88005ede5800
[69713.824330] [6450] l2cap_sock_getsockopt: sk ffff88005ede5800
[69713.825029] [6450] bt_sock_poll: sock ffff88005e758500, sk ffff88010323b800
...
[69713.825187] [6450] l2cap_sock_sendmsg: sock ffff8800941ab700, sk ffff88005ede5800
[69713.825189] [6450] bt_sock_wait_ready: sk ffff88005ede5800
[69713.825192] [6450] l2cap_create_basic_pdu: chan ffff88005ede2c00 len 3
[69713.825196] [6450] l2cap_do_send: chan ffff88005ede2c00, skb ffff880160b0b500 len 7 priority 0
[69713.825199] [6450] hci_send_acl: hci0 chan ffff88020d60b1c0 flags 0x0000
[69713.825201] [6450] hci_queue_acl: hci0 nonfrag skb ffff880160b0b500 len 11
[69713.825210] [8909] hci_tx_work: hci0 acl 9 sco 8 le 0
[69713.825213] [8909] hci_sched_acl: hci0
[69713.825214] [8909] hci_sched_sco: hci0
[69713.825216] [8909] hci_sched_esco: hci0
[69713.825217] [8909] hci_sched_le: hci0
[69713.825219] [8909] hci_chan_sent: hci0
[69713.825221] [8909] hci_chan_sent: chan ffff88020d60b1c0 quote 9
[69713.825223] [8909] hci_sched_le: chan ffff88020d60b1c0 skb ffff880160b0b500 len 11 priority 0
[69713.825225] [8909] hci_send_frame: hci0 type 2 len 11
[69713.825227] [8909] hci_send_to_monitor: hdev ffff88021f0c7000 len 11
[69713.825242] [8909] hci_chan_sent: hci0
[69713.825253] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000
[69713.825253] [8909] hci_prio_recalculate: hci0
[69713.825292] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000
[69713.825768] [6450] bt_sock_poll: sock ffff88005e758500, sk ffff88010323b800
...
[69713.866902] [8909] hci_rx_work: hci0
[69713.866921] [8909] hci_send_to_monitor: hdev ffff88021f0c7000 len 7
[69713.866928] [8909] hci_rx_work: hci0 Event packet
[69713.866931] [8909] hci_num_comp_pkts_evt: hci0 num_hndl 1
[69713.866937] [8909] hci_tx_work: hci0 acl 9 sco 8 le 0
[69713.866939] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000
[69713.866940] [8909] hci_sched_acl: hci0
...
[69713.866944] [8909] hci_sched_le: hci0
[69713.866953] [8909] hci_chan_sent: hci0
[69713.866997] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000
[69713.867840] [28074] hci_rx_work: hci0
[69713.867844] [28074] hci_send_to_monitor: hdev ffff88021f0c7000 len 7
[69713.867850] [28074] hci_rx_work: hci0 Event packet
[69713.867853] [28074] hci_num_comp_pkts_evt: hci0 num_hndl 1
[69713.867857] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000
[69713.867858] [28074] hci_tx_work: hci0 acl 10 sco 8 le 0
[69713.867860] [28074] hci_sched_acl: hci0
[69713.867861] [28074] hci_sched_sco: hci0
[69713.867862] [28074] hci_sched_esco: hci0
[69713.867863] [28074] hci_sched_le: hci0
[69713.867865] [28074] hci_chan_sent: hci0
[69713.867888] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000
[69714.145661] [8909] hci_rx_work: hci0
[69714.145666] [8909] hci_send_to_monitor: hdev ffff88021f0c7000 len 10
[69714.145676] [8909] hci_rx_work: hci0 ACL data packet
[69714.145679] [8909] hci_acldata_packet: hci0 len 6 handle 0x002d flags 0x0002
[69714.145681] [8909] hci_conn_enter_active_mode: hcon ffff88021f65a000 mode 0
[69714.145683] [8909] l2cap_recv_acldata: conn ffff880221005c00 len 6 flags 0x2
[69714.145693] [8909] l2cap_recv_frame: len 2, cid 0x0006
[69714.145696] [8909] hci_send_to_control: len 14
[69714.145710] [8909] smp_chan_destroy:
[69714.145713] [8909] pairing_complete: status 3
[69714.145714] [8909] cmd_complete: sock ffff88010323ac00
[69714.145717] [8909] hci_conn_drop: hcon ffff88021f65a000 orig refcnt 3
[69714.145719] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000
[69714.145720] [6450] bt_sock_poll: sock ffff88005e758500, sk ffff88010323b800
[69714.145722] [6515] hci_sock_recvmsg: sock ffff88005e75a080, sk ffff88010323ac00
[69714.145724] [6450] bt_sock_poll: sock ffff8801db6b4f00, sk ffff880160351c00
...
[69714.145735] [6515] hci_sock_recvmsg: sock ffff88005e75a080, sk ffff88010323ac00
[69714.145737] [8909] hci_conn_drop: hcon ffff88021f65a000 orig refcnt 2
[69714.145739] [8909] l2cap_conn_del: hcon ffff88021f65a000 conn ffff880221005c00, err 13
[69714.145740] [6450] bt_sock_poll: sock ffff8801db6b5400, sk ffff88021e775000
[69714.145743] [6450] bt_sock_poll: sock ffff8801db6b5e00, sk ffff880160356000
[69714.145744] [8909] l2cap_chan_hold: chan ffff88005ede2c00 orig refcnt 3
[69714.145746] [6450] bt_sock_poll: sock ffff8800941ab700, sk ffff88005ede5800
[69714.145748] [8909] l2cap_chan_del: chan ffff88005ede2c00, conn ffff880221005c00, err 13
[69714.145749] [8909] l2cap_chan_put: chan ffff88005ede2c00 orig refcnt 4
[69714.145751] [8909] hci_conn_drop: hcon ffff88021f65a000 orig refcnt 1
[69714.145754] [6450] bt_sock_poll: sock ffff8800941ab700, sk ffff88005ede5800
[69714.145756] [8909] l2cap_chan_put: chan ffff88005ede2c00 orig refcnt 3
[69714.145759] [8909] hci_chan_del: hci0 hcon ffff88021f65a000 chan ffff88020d60b1c0
[69714.145766] [5949] hci_sock_recvmsg: sock ffff8800941a9680, sk ffff88012bf4d000
[69714.145787] [6515] hci_sock_release: sock ffff88005e75a080 sk ffff88010323ac00
[69714.146002] [6450] hci_sock_recvmsg: sock ffff88005e758780, sk ffff88010323d400
[69714.150795] [6450] l2cap_sock_release: sock ffff8800941ab700, sk ffff88005ede5800
[69714.150799] [6450] l2cap_sock_shutdown: sock ffff8800941ab700, sk ffff88005ede5800
[69714.150802] [6450] l2cap_chan_close: chan ffff88005ede2c00 state BT_CLOSED
[69714.150805] [6450] l2cap_sock_kill: sk ffff88005ede5800 state BT_CLOSED
[69714.150806] [6450] l2cap_chan_put: chan ffff88005ede2c00 orig refcnt 2
[69714.150808] [6450] l2cap_sock_destruct: sk ffff88005ede5800
[69714.150809] [6450] l2cap_chan_put: chan ffff88005ede2c00 orig refcnt 1
[69714.150811] [6450] l2cap_chan_destroy: chan ffff88005ede2c00
[69714.150970] [6450] bt_sock_poll: sock ffff88005e758500, sk ffff88010323b800
...
[69714.151991] [8909] hci_conn_drop: hcon ffff88021f65a000 orig refcnt 0
[69716.150339] [8909] hci_conn_timeout: hcon ffff88021f65a000 state BT_CONNECTED, refcnt -1

Signed-off-by: Lukasz Rymanowski <lukasz.rymanowski@tieto.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/hci_conn.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index ca01d1861854..a7a27bc2c0b1 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -289,10 +289,20 @@ static void hci_conn_timeout(struct work_struct *work)
 {
 	struct hci_conn *conn = container_of(work, struct hci_conn,
 					     disc_work.work);
+	int refcnt = atomic_read(&conn->refcnt);
 
 	BT_DBG("hcon %p state %s", conn, state_to_string(conn->state));
 
-	if (atomic_read(&conn->refcnt))
+	WARN_ON(refcnt < 0);
+
+	/* FIXME: It was observed that in pairing failed scenario, refcnt
+	 * drops below 0. Probably this is because l2cap_conn_del calls
+	 * l2cap_chan_del for each channel, and inside l2cap_chan_del conn is
+	 * dropped. After that loop hci_chan_del is called which also drops
+	 * conn. For now make sure that ACL is alive if refcnt is higher then 0,
+	 * otherwise drop it.
+	 */
+	if (refcnt > 0)
 		return;
 
 	switch (conn->state) {
-- 
cgit 


From 744462a91ecf5d1ec64857488bf99000ef626921 Mon Sep 17 00:00:00 2001
From: Max Stepanov <Max.Stepanov@intel.com>
Date: Tue, 10 Jun 2014 20:00:08 +0300
Subject: mac80211: WEP extra head/tail room in ieee80211_send_auth

After skb allocation and call to ieee80211_wep_encrypt in ieee80211_send_auth
the flow fails with a warning in ieee80211_wep_add_iv on verification of
available head/tailroom needed for WEP_IV and WEP_ICV.

Signed-off-by: Max Stepanov <Max.Stepanov@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/util.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 6886601afe1c..a6cda52ed920 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1096,11 +1096,12 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
 	int err;
 
 	/* 24 + 6 = header + auth_algo + auth_transaction + status_code */
-	skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24 + 6 + extra_len);
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN +
+			    24 + 6 + extra_len + IEEE80211_WEP_ICV_LEN);
 	if (!skb)
 		return;
 
-	skb_reserve(skb, local->hw.extra_tx_headroom);
+	skb_reserve(skb, local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN);
 
 	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6);
 	memset(mgmt, 0, 24 + 6);
-- 
cgit 


From e33e2241e272eddc38339692500bd1c7d8753a77 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 23 Jun 2014 11:06:16 +0200
Subject: Revert "cfg80211: Use 5MHz bandwidth by default when checking usable
 channels"

This reverts commit 8eca1fb692cc9557f386eddce75c300a3855d11a.

Felix notes that this broke regulatory, leaving channel 12 open for AP
operation in the US regulatory domain where it isn't permitted.

Link: http://mid.gmane.org/53A6C0FF.9090104@openwrt.org
Reported-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/reg.c | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 558b0e3a02d8..1afdf45db38f 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -935,7 +935,7 @@ freq_reg_info_regd(struct wiphy *wiphy, u32 center_freq,
 		if (!band_rule_found)
 			band_rule_found = freq_in_rule_band(fr, center_freq);
 
-		bw_fits = reg_does_bw_fit(fr, center_freq, MHZ_TO_KHZ(5));
+		bw_fits = reg_does_bw_fit(fr, center_freq, MHZ_TO_KHZ(20));
 
 		if (band_rule_found && bw_fits)
 			return rr;
@@ -1019,10 +1019,10 @@ static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd,
 }
 #endif
 
-/* Find an ieee80211_reg_rule such that a 5MHz channel with frequency
- * chan->center_freq fits there.
- * If there is no such reg_rule, disable the channel, otherwise set the
- * flags corresponding to the bandwidths allowed in the particular reg_rule
+/*
+ * Note that right now we assume the desired channel bandwidth
+ * is always 20 MHz for each individual channel (HT40 uses 20 MHz
+ * per channel, the primary and the extension channel).
  */
 static void handle_channel(struct wiphy *wiphy,
 			   enum nl80211_reg_initiator initiator,
@@ -1083,12 +1083,8 @@ static void handle_channel(struct wiphy *wiphy,
 	if (reg_rule->flags & NL80211_RRF_AUTO_BW)
 		max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule);
 
-	if (max_bandwidth_khz < MHZ_TO_KHZ(10))
-		bw_flags = IEEE80211_CHAN_NO_10MHZ;
-	if (max_bandwidth_khz < MHZ_TO_KHZ(20))
-		bw_flags |= IEEE80211_CHAN_NO_20MHZ;
 	if (max_bandwidth_khz < MHZ_TO_KHZ(40))
-		bw_flags |= IEEE80211_CHAN_NO_HT40;
+		bw_flags = IEEE80211_CHAN_NO_HT40;
 	if (max_bandwidth_khz < MHZ_TO_KHZ(80))
 		bw_flags |= IEEE80211_CHAN_NO_80MHZ;
 	if (max_bandwidth_khz < MHZ_TO_KHZ(160))
@@ -1522,12 +1518,8 @@ static void handle_channel_custom(struct wiphy *wiphy,
 	if (reg_rule->flags & NL80211_RRF_AUTO_BW)
 		max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule);
 
-	if (max_bandwidth_khz < MHZ_TO_KHZ(10))
-		bw_flags = IEEE80211_CHAN_NO_10MHZ;
-	if (max_bandwidth_khz < MHZ_TO_KHZ(20))
-		bw_flags |= IEEE80211_CHAN_NO_20MHZ;
 	if (max_bandwidth_khz < MHZ_TO_KHZ(40))
-		bw_flags |= IEEE80211_CHAN_NO_HT40;
+		bw_flags = IEEE80211_CHAN_NO_HT40;
 	if (max_bandwidth_khz < MHZ_TO_KHZ(80))
 		bw_flags |= IEEE80211_CHAN_NO_80MHZ;
 	if (max_bandwidth_khz < MHZ_TO_KHZ(160))
-- 
cgit 


From 0ce12026d6ea4a24d52ddcde36b9643f2e26d560 Mon Sep 17 00:00:00 2001
From: Eliad Peller <eliad@wizery.com>
Date: Tue, 17 Jun 2014 13:07:02 +0300
Subject: cfg80211: fix elapsed_jiffies calculation

MAX_JIFFY_OFFSET has no meaning when calculating the
elapsed jiffies, as jiffies run out until ULONG_MAX.

This miscalculation results in erroneous values
in case of a wrap-around.

Signed-off-by: Eliad Peller <eliad@wizery.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/wireless/core.h b/net/wireless/core.h
index e9afbf10e756..7e3a3cef7df9 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -424,7 +424,7 @@ static inline unsigned int elapsed_jiffies_msecs(unsigned long start)
 	if (end >= start)
 		return jiffies_to_msecs(end - start);
 
-	return jiffies_to_msecs(end + (MAX_JIFFY_OFFSET - start) + 1);
+	return jiffies_to_msecs(end + (ULONG_MAX - start) + 1);
 }
 
 void
-- 
cgit 


From 02df00eb0019e7d15a1fcddebe4d020226c1ccda Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 10 Jun 2014 14:06:25 +0200
Subject: nl80211: move set_qos_map command into split state

The non-split wiphy state shouldn't be increased in size
so move the new set_qos_map command into the split if
statement.

Cc: stable@vger.kernel.org (3.14+)
Fixes: fa9ffc745610 ("cfg80211: Add support for QoS mapping")
Reviewed-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index ba4f1723c83a..6668daf69326 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1497,18 +1497,17 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 		}
 		CMD(start_p2p_device, START_P2P_DEVICE);
 		CMD(set_mcast_rate, SET_MCAST_RATE);
+#ifdef CONFIG_NL80211_TESTMODE
+		CMD(testmode_cmd, TESTMODE);
+#endif
 		if (state->split) {
 			CMD(crit_proto_start, CRIT_PROTOCOL_START);
 			CMD(crit_proto_stop, CRIT_PROTOCOL_STOP);
 			if (rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH)
 				CMD(channel_switch, CHANNEL_SWITCH);
+			CMD(set_qos_map, SET_QOS_MAP);
 		}
-		CMD(set_qos_map, SET_QOS_MAP);
-
-#ifdef CONFIG_NL80211_TESTMODE
-		CMD(testmode_cmd, TESTMODE);
-#endif
-
+		/* add into the if now */
 #undef CMD
 
 		if (rdev->ops->connect || rdev->ops->auth) {
-- 
cgit 


From 3e215c8d1b6b772d107f1811b5ee8eae7a046fb4 Mon Sep 17 00:00:00 2001
From: James M Leddy <james.leddy@redhat.com>
Date: Wed, 25 Jun 2014 17:38:13 -0400
Subject: udp: Add MIB counters for rcvbuferrors

Add MIB counters for rcvbuferrors in UDP to help diagnose problems.

Signed-off-by: James M Leddy <james.leddy@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 5 ++++-
 net/ipv6/udp.c | 6 +++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index d92f94b7e402..7d5a8661df76 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1588,8 +1588,11 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		goto csum_error;
 
 
-	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf))
+	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
+		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+				 is_udplite);
 		goto drop;
+	}
 
 	rc = 0;
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 95c834799288..7092ff78fd84 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -674,8 +674,11 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 			goto csum_error;
 	}
 
-	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf))
+	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
+		UDP6_INC_STATS_BH(sock_net(sk),
+				  UDP_MIB_RCVBUFERRORS, is_udplite);
 		goto drop;
+	}
 
 	skb_dst_drop(skb);
 
@@ -690,6 +693,7 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	bh_unlock_sock(sk);
 
 	return rc;
+
 csum_error:
 	UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
 drop:
-- 
cgit 


From e940f5d6ba6a01f8dbb870854d5205d322452730 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Fri, 27 Jun 2014 09:57:53 +0800
Subject: ipv6: Fix MLD Query message check

Based on RFC3810 6.2, we also need to check the hop limit and router alert
option besides source address.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/mcast.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 08b367c6b9cf..617f0958e164 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1301,8 +1301,17 @@ int igmp6_event_query(struct sk_buff *skb)
 	len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr);
 	len -= skb_network_header_len(skb);
 
-	/* Drop queries with not link local source */
-	if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL))
+	/* RFC3810 6.2
+	 * Upon reception of an MLD message that contains a Query, the node
+	 * checks if the source address of the message is a valid link-local
+	 * address, if the Hop Limit is set to 1, and if the Router Alert
+	 * option is present in the Hop-By-Hop Options header of the IPv6
+	 * packet.  If any of these checks fails, the packet is dropped.
+	 */
+	if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL) ||
+	    ipv6_hdr(skb)->hop_limit != 1 ||
+	    !(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) ||
+	    IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD))
 		return -EINVAL;
 
 	idev = __in6_dev_get(skb->dev);
-- 
cgit 


From fe984c08e20f0fc2b4666bf8eeeb02605568387b Mon Sep 17 00:00:00 2001
From: Andy Zhou <azhou@nicira.com>
Date: Tue, 6 May 2014 17:23:48 -0700
Subject: openvswitch: Fix a double free bug for the sample action

When sample action returns with an error, the skb has already been
freed. This patch fix a bug to make sure we don't free it again.
This bug introduced by commit ccb1352e76cff05 (net: Add Open vSwitch
kernel components.)

Signed-off-by: Andy Zhou <azhou@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/actions.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index c36856a457ca..e70d8b18e962 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -551,6 +551,8 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 
 		case OVS_ACTION_ATTR_SAMPLE:
 			err = sample(dp, skb, a);
+			if (unlikely(err)) /* skb already freed. */
+				return err;
 			break;
 		}
 
-- 
cgit 


From e0bb8c44ed5cfcc56b571758ed966ee48779024c Mon Sep 17 00:00:00 2001
From: Wei Zhang <asuka.com@163.com>
Date: Sat, 28 Jun 2014 12:34:53 -0700
Subject: openvswitch: supply a dummy err_handler of gre_cisco_protocol to
 prevent kernel crash

When use gre vport, openvswitch register a gre_cisco_protocol but
does not supply a err_handler with it. The gre_cisco_err() in
net/ipv4/gre_demux.c expect err_handler be provided with the
gre_cisco_protocol implementation, and call ->err_handler() without
existence check, cause the kernel crash.

This patch provide a err_handler to fix this bug.
This bug introduced by commit aa310701e787087d (openvswitch: Add gre
tunnel support.)

Signed-off-by: Wei Zhang <asuka.com@163.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/vport-gre.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'net')

diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 35ec4fed09e2..f49148a07da2 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -110,6 +110,22 @@ static int gre_rcv(struct sk_buff *skb,
 	return PACKET_RCVD;
 }
 
+/* Called with rcu_read_lock and BH disabled. */
+static int gre_err(struct sk_buff *skb, u32 info,
+		   const struct tnl_ptk_info *tpi)
+{
+	struct ovs_net *ovs_net;
+	struct vport *vport;
+
+	ovs_net = net_generic(dev_net(skb->dev), ovs_net_id);
+	vport = rcu_dereference(ovs_net->vport_net.gre_vport);
+
+	if (unlikely(!vport))
+		return PACKET_REJECT;
+	else
+		return PACKET_RCVD;
+}
+
 static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
 {
 	struct net *net = ovs_dp_get_net(vport->dp);
@@ -186,6 +202,7 @@ error:
 
 static struct gre_cisco_protocol gre_protocol = {
 	.handler        = gre_rcv,
+	.err_handler    = gre_err,
 	.priority       = 1,
 };
 
-- 
cgit 


From ad55200734c65a3ec5d0c39d6ea904008baea536 Mon Sep 17 00:00:00 2001
From: Ben Pfaff <blp@nicira.com>
Date: Tue, 6 May 2014 16:48:38 -0700
Subject: openvswitch: Fix tracking of flags seen in TCP flows.

Flow statistics need to take into account the TCP flags from the packet
currently being processed (in 'key'), not the TCP flags matched by the
flow found in the kernel flow table (in 'flow').

This bug made the Open vSwitch userspace fin_timeout action have no effect
in many cases.
This bug is introduced by commit 88d73f6c411ac2f0578 (openvswitch: Use
TCP flags in the flow key for stats.)

Reported-by: Len Gao <leng@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/datapath.c | 4 ++--
 net/openvswitch/flow.c     | 4 ++--
 net/openvswitch/flow.h     | 5 +++--
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 0d407bca81e3..a863678c50ac 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2013 Nicira, Inc.
+ * Copyright (c) 2007-2014 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -276,7 +276,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
 	OVS_CB(skb)->flow = flow;
 	OVS_CB(skb)->pkt_key = &key;
 
-	ovs_flow_stats_update(OVS_CB(skb)->flow, skb);
+	ovs_flow_stats_update(OVS_CB(skb)->flow, key.tp.flags, skb);
 	ovs_execute_actions(dp, skb);
 	stats_counter = &stats->n_hit;
 
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 334751cb1528..d07ab538fc9d 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -61,10 +61,10 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies)
 
 #define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF))
 
-void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
+void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
+			   struct sk_buff *skb)
 {
 	struct flow_stats *stats;
-	__be16 tcp_flags = flow->key.tp.flags;
 	int node = numa_node_id();
 
 	stats = rcu_dereference(flow->stats[node]);
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index ac395d2cd821..5e5aaed3a85b 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2013 Nicira, Inc.
+ * Copyright (c) 2007-2014 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -180,7 +180,8 @@ struct arp_eth_header {
 	unsigned char       ar_tip[4];		/* target IP address        */
 } __packed;
 
-void ovs_flow_stats_update(struct sw_flow *, struct sk_buff *);
+void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags,
+			   struct sk_buff *);
 void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *,
 			unsigned long *used, __be16 *tcp_flags);
 void ovs_flow_stats_clear(struct sw_flow *);
-- 
cgit 


From 4a46b24e147dfa9b858026da02cad0bdd4e149d2 Mon Sep 17 00:00:00 2001
From: Alex Wang <alexw@nicira.com>
Date: Mon, 30 Jun 2014 20:30:29 -0700
Subject: openvswitch: Use exact lookup for flow_get and flow_del.

Due to the race condition in userspace, there is chance that two
overlapping megaflows could be installed in datapath.  And this
causes userspace unable to delete the less inclusive megaflow flow
even after it timeout, since the flow_del logic will stop at the
first match of masked flow.

This commit fixes the bug by making the kernel flow_del and flow_get
logic check all masks in that case.

Introduced by 03f0d916a (openvswitch: Mega flow implementation).

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/datapath.c   | 23 +++++++++++------------
 net/openvswitch/flow_table.c | 16 ++++++++++++++++
 net/openvswitch/flow_table.h |  3 ++-
 3 files changed, 29 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index a863678c50ac..9db4bf6740d1 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -889,8 +889,11 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
 		}
 		/* The unmasked key has to be the same for flow updates. */
 		if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) {
-			error = -EEXIST;
-			goto err_unlock_ovs;
+			flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
+			if (!flow) {
+				error = -ENOENT;
+				goto err_unlock_ovs;
+			}
 		}
 		/* Update actions. */
 		old_acts = ovsl_dereference(flow->sf_acts);
@@ -981,16 +984,12 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
 		goto err_unlock_ovs;
 	}
 	/* Check that the flow exists. */
-	flow = ovs_flow_tbl_lookup(&dp->table, &key);
+	flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
 	if (unlikely(!flow)) {
 		error = -ENOENT;
 		goto err_unlock_ovs;
 	}
-	/* The unmasked key has to be the same for flow updates. */
-	if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) {
-		error = -EEXIST;
-		goto err_unlock_ovs;
-	}
+
 	/* Update actions, if present. */
 	if (likely(acts)) {
 		old_acts = ovsl_dereference(flow->sf_acts);
@@ -1063,8 +1062,8 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
 		goto unlock;
 	}
 
-	flow = ovs_flow_tbl_lookup(&dp->table, &key);
-	if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) {
+	flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
+	if (!flow) {
 		err = -ENOENT;
 		goto unlock;
 	}
@@ -1113,8 +1112,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
 		goto unlock;
 	}
 
-	flow = ovs_flow_tbl_lookup(&dp->table, &key);
-	if (unlikely(!flow || !ovs_flow_cmp_unmasked_key(flow, &match))) {
+	flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
+	if (unlikely(!flow)) {
 		err = -ENOENT;
 		goto unlock;
 	}
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 574c3abc9b30..cf2d853646f0 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -456,6 +456,22 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
 	return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit);
 }
 
+struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
+					  struct sw_flow_match *match)
+{
+	struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
+	struct sw_flow_mask *mask;
+	struct sw_flow *flow;
+
+	/* Always called under ovs-mutex. */
+	list_for_each_entry(mask, &tbl->mask_list, list) {
+		flow = masked_flow_lookup(ti, match->key, mask);
+		if (flow && ovs_flow_cmp_unmasked_key(flow, match))  /* Found */
+			return flow;
+	}
+	return NULL;
+}
+
 int ovs_flow_tbl_num_masks(const struct flow_table *table)
 {
 	struct sw_flow_mask *mask;
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
index ca8a5820f615..5918bff7f3f6 100644
--- a/net/openvswitch/flow_table.h
+++ b/net/openvswitch/flow_table.h
@@ -76,7 +76,8 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *,
 				    u32 *n_mask_hit);
 struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *,
 				    const struct sw_flow_key *);
-
+struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
+					  struct sw_flow_match *match);
 bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
 			       struct sw_flow_match *match);
 
-- 
cgit 


From 7f502361531e9eecb396cf99bdc9e9a59f7ebd7f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Jun 2014 01:26:23 -0700
Subject: ipv4: irq safe sk_dst_[re]set() and ipv4_sk_update_pmtu() fix

We have two different ways to handle changes to sk->sk_dst

First way (used by TCP) assumes socket lock is owned by caller, and use
no extra lock : __sk_dst_set() & __sk_dst_reset()

Another way (used by UDP) uses sk_dst_lock because socket lock is not
always taken. Note that sk_dst_lock is not softirq safe.

These ways are not inter changeable for a given socket type.

ipv4_sk_update_pmtu(), added in linux-3.8, added a race, as it used
the socket lock as synchronization, but users might be UDP sockets.

Instead of converting sk_dst_lock to a softirq safe version, use xchg()
as we did for sk_rx_dst in commit e47eb5dfb296b ("udp: ipv4: do not use
sk_dst_lock from softirq context")

In a follow up patch, we probably can remove sk_dst_lock, as it is
only used in IPv6.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Fixes: 9cb3a50c5f63e ("ipv4: Invalidate the socket cached route on pmtu events if possible")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 082239ffe34a..3162ea923ded 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1010,7 +1010,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 	const struct iphdr *iph = (const struct iphdr *) skb->data;
 	struct flowi4 fl4;
 	struct rtable *rt;
-	struct dst_entry *dst;
+	struct dst_entry *odst = NULL;
 	bool new = false;
 
 	bh_lock_sock(sk);
@@ -1018,16 +1018,17 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 	if (!ip_sk_accept_pmtu(sk))
 		goto out;
 
-	rt = (struct rtable *) __sk_dst_get(sk);
+	odst = sk_dst_get(sk);
 
-	if (sock_owned_by_user(sk) || !rt) {
+	if (sock_owned_by_user(sk) || !odst) {
 		__ipv4_sk_update_pmtu(skb, sk, mtu);
 		goto out;
 	}
 
 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
 
-	if (!__sk_dst_check(sk, 0)) {
+	rt = (struct rtable *)odst;
+	if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
 		if (IS_ERR(rt))
 			goto out;
@@ -1037,8 +1038,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 
 	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
 
-	dst = dst_check(&rt->dst, 0);
-	if (!dst) {
+	if (!dst_check(&rt->dst, 0)) {
 		if (new)
 			dst_release(&rt->dst);
 
@@ -1050,10 +1050,11 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 	}
 
 	if (new)
-		__sk_dst_set(sk, &rt->dst);
+		sk_dst_set(sk, &rt->dst);
 
 out:
 	bh_unlock_sock(sk);
+	dst_release(odst);
 }
 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
 
-- 
cgit 


From a48e5fafecfb9c0c807d7e7284b5ff884dfb7a3a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 2 Jul 2014 02:25:15 -0700
Subject: vlan: free percpu stats in device destructor

Madalin-Cristian reported crashs happening after a recent commit
(5a4ae5f6e7d4 "vlan: unnecessary to check if vlan_pcpu_stats is NULL")

-----------------------------------------------------------------------
root@p5040ds:~# vconfig add eth8 1
root@p5040ds:~# vconfig rem eth8.1
Unable to handle kernel paging request for data at address 0x2bc88028
Faulting instruction address: 0xc058e950
Oops: Kernel access of bad area, sig: 11 [#1]
SMP NR_CPUS=8 CoreNet Generic
Modules linked in:
CPU: 3 PID: 2167 Comm: vconfig Tainted: G        W     3.16.0-rc3-00346-g65e85bf #2
task: e7264d90 ti: e2c2c000 task.ti: e2c2c000
NIP: c058e950 LR: c058ea30 CTR: c058e900
REGS: e2c2db20 TRAP: 0300   Tainted: G        W      (3.16.0-rc3-00346-g65e85bf)
MSR: 00029002 <CE,EE,ME>  CR: 48000428  XER: 20000000
DEAR: 2bc88028 ESR: 00000000
GPR00: c047299c e2c2dbd0 e7264d90 00000000 2bc88000 00000000 ffffffff 00000000
GPR08: 0000000f 00000000 000000ff 00000000 28000422 10121928 10100000 10100000
GPR16: 10100000 00000000 c07c5968 00000000 00000000 00000000 e2c2dc48 e7838000
GPR24: c07c5bac c07c58a8 e77290cc c07b0000 00000000 c05de6c0 e7838000 e2c2dc48
NIP [c058e950] vlan_dev_get_stats64+0x50/0x170
LR [c058ea30] vlan_dev_get_stats64+0x130/0x170
Call Trace:
[e2c2dbd0] [ffffffea] 0xffffffea (unreliable)
[e2c2dc20] [c047299c] dev_get_stats+0x4c/0x140
[e2c2dc40] [c0488ca8] rtnl_fill_ifinfo+0x3d8/0x960
[e2c2dd70] [c0489f4c] rtmsg_ifinfo+0x6c/0x110
[e2c2dd90] [c04731d4] rollback_registered_many+0x344/0x3b0
[e2c2ddd0] [c047332c] rollback_registered+0x2c/0x50
[e2c2ddf0] [c0476058] unregister_netdevice_queue+0x78/0xf0
[e2c2de00] [c058d800] unregister_vlan_dev+0xc0/0x160
[e2c2de20] [c058e360] vlan_ioctl_handler+0x1c0/0x550
[e2c2de90] [c045d11c] sock_ioctl+0x28c/0x2f0
[e2c2deb0] [c010d070] do_vfs_ioctl+0x90/0x7b0
[e2c2df20] [c010d7d0] SyS_ioctl+0x40/0x80
[e2c2df40] [c000f924] ret_from_syscall+0x0/0x3c

Fix this problem by freeing percpu stats from dev->destructor() instead
of ndo_uninit()

Reported-by: Madalin-Cristian Bucur <madalin.bucur@freescale.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Madalin-Cristian Bucur <madalin.bucur@freescale.com>
Fixes: 5a4ae5f6e7d4 ("vlan: unnecessary to check if vlan_pcpu_stats is NULL")
Cc: Li RongQing <roy.qing.li@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan_dev.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index ad2ac3c00398..dd11f612e03e 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -627,8 +627,6 @@ static void vlan_dev_uninit(struct net_device *dev)
 	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
 	int i;
 
-	free_percpu(vlan->vlan_pcpu_stats);
-	vlan->vlan_pcpu_stats = NULL;
 	for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) {
 		while ((pm = vlan->egress_priority_map[i]) != NULL) {
 			vlan->egress_priority_map[i] = pm->next;
@@ -785,6 +783,15 @@ static const struct net_device_ops vlan_netdev_ops = {
 	.ndo_get_lock_subclass  = vlan_dev_get_lock_subclass,
 };
 
+static void vlan_dev_free(struct net_device *dev)
+{
+	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+
+	free_percpu(vlan->vlan_pcpu_stats);
+	vlan->vlan_pcpu_stats = NULL;
+	free_netdev(dev);
+}
+
 void vlan_setup(struct net_device *dev)
 {
 	ether_setup(dev);
@@ -794,7 +801,7 @@ void vlan_setup(struct net_device *dev)
 	dev->tx_queue_len	= 0;
 
 	dev->netdev_ops		= &vlan_netdev_ops;
-	dev->destructor		= free_netdev;
+	dev->destructor		= vlan_dev_free;
 	dev->ethtool_ops	= &vlan_ethtool_ops;
 
 	memset(dev->broadcast, 0, ETH_ALEN);
-- 
cgit 


From 5924f17a8a30c2ae18d034a86ee7581b34accef6 Mon Sep 17 00:00:00 2001
From: Christoph Paasch <christoph.paasch@uclouvain.be>
Date: Sat, 28 Jun 2014 18:26:37 +0200
Subject: tcp: Fix divide by zero when pushing during tcp-repair

When in repair-mode and TCP_RECV_QUEUE is set, we end up calling
tcp_push with mss_now being 0. If data is in the send-queue and
tcp_set_skb_tso_segs gets called, we crash because it will divide by
mss_now:

[  347.151939] divide error: 0000 [#1] SMP
[  347.152907] Modules linked in:
[  347.152907] CPU: 1 PID: 1123 Comm: packetdrill Not tainted 3.16.0-rc2 #4
[  347.152907] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[  347.152907] task: f5b88540 ti: f3c82000 task.ti: f3c82000
[  347.152907] EIP: 0060:[<c1601359>] EFLAGS: 00210246 CPU: 1
[  347.152907] EIP is at tcp_set_skb_tso_segs+0x49/0xa0
[  347.152907] EAX: 00000b67 EBX: f5acd080 ECX: 00000000 EDX: 00000000
[  347.152907] ESI: f5a28f40 EDI: f3c88f00 EBP: f3c83d10 ESP: f3c83d00
[  347.152907]  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
[  347.152907] CR0: 80050033 CR2: 083158b0 CR3: 35146000 CR4: 000006b0
[  347.152907] Stack:
[  347.152907]  c167f9d9 f5acd080 000005b4 00000002 f3c83d20 c16013e6 f3c88f00 f5acd080
[  347.152907]  f3c83da0 c1603b5a f3c83d38 c10a0188 00000000 00000000 f3c83d84 c10acc85
[  347.152907]  c1ad5ec0 00000000 00000000 c1ad679c 010003e0 00000000 00000000 f3c88fc8
[  347.152907] Call Trace:
[  347.152907]  [<c167f9d9>] ? apic_timer_interrupt+0x2d/0x34
[  347.152907]  [<c16013e6>] tcp_init_tso_segs+0x36/0x50
[  347.152907]  [<c1603b5a>] tcp_write_xmit+0x7a/0xbf0
[  347.152907]  [<c10a0188>] ? up+0x28/0x40
[  347.152907]  [<c10acc85>] ? console_unlock+0x295/0x480
[  347.152907]  [<c10ad24f>] ? vprintk_emit+0x1ef/0x4b0
[  347.152907]  [<c1605716>] __tcp_push_pending_frames+0x36/0xd0
[  347.152907]  [<c15f4860>] tcp_push+0xf0/0x120
[  347.152907]  [<c15f7641>] tcp_sendmsg+0xf1/0xbf0
[  347.152907]  [<c116d920>] ? kmem_cache_free+0xf0/0x120
[  347.152907]  [<c106a682>] ? __sigqueue_free+0x32/0x40
[  347.152907]  [<c106a682>] ? __sigqueue_free+0x32/0x40
[  347.152907]  [<c114f0f0>] ? do_wp_page+0x3e0/0x850
[  347.152907]  [<c161c36a>] inet_sendmsg+0x4a/0xb0
[  347.152907]  [<c1150269>] ? handle_mm_fault+0x709/0xfb0
[  347.152907]  [<c15a006b>] sock_aio_write+0xbb/0xd0
[  347.152907]  [<c1180b79>] do_sync_write+0x69/0xa0
[  347.152907]  [<c1181023>] vfs_write+0x123/0x160
[  347.152907]  [<c1181d55>] SyS_write+0x55/0xb0
[  347.152907]  [<c167f0d8>] sysenter_do_call+0x12/0x28

This can easily be reproduced with the following packetdrill-script (the
"magic" with netem, sk_pacing and limit_output_bytes is done to prevent
the kernel from pushing all segments, because hitting the limit without
doing this is not so easy with packetdrill):

0   socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0  setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0

+0  bind(3, ..., ...) = 0
+0  listen(3, 1) = 0

+0  < S 0:0(0) win 32792 <mss 1460>
+0  > S. 0:0(0) ack 1 <mss 1460>
+0.1  < . 1:1(0) ack 1 win 65000

+0  accept(3, ..., ...) = 4

// This forces that not all segments of the snd-queue will be pushed
+0 `tc qdisc add dev tun0 root netem delay 10ms`
+0 `sysctl -w net.ipv4.tcp_limit_output_bytes=2`
+0 setsockopt(4, SOL_SOCKET, 47, [2], 4) = 0

+0 write(4,...,10000) = 10000
+0 write(4,...,10000) = 10000

// Set tcp-repair stuff, particularly TCP_RECV_QUEUE
+0 setsockopt(4, SOL_TCP, 19, [1], 4) = 0
+0 setsockopt(4, SOL_TCP, 20, [1], 4) = 0

// This now will make the write push the remaining segments
+0 setsockopt(4, SOL_SOCKET, 47, [20000], 4) = 0
+0 `sysctl -w net.ipv4.tcp_limit_output_bytes=130000`

// Now we will crash
+0 write(4,...,1000) = 1000

This happens since ec3423257508 (tcp: fix retransmission in repair
mode). Prior to that, the call to tcp_push was prevented by a check for
tp->repair.

The patch fixes it, by adding the new goto-label out_nopush. When exiting
tcp_sendmsg and a push is not required, which is the case for tp->repair,
we go to this label.

When repairing and calling send() with TCP_RECV_QUEUE, the data is
actually put in the receive-queue. So, no push is required because no
data has been added to the send-queue.

Cc: Andrew Vagin <avagin@openvz.org>
Cc: Pavel Emelyanov <xemul@parallels.com>
Fixes: ec3423257508 (tcp: fix retransmission in repair mode)
Signed-off-by: Christoph Paasch <christoph.paasch@uclouvain.be>
Acked-by: Andrew Vagin <avagin@openvz.org>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index eb1dde37e678..9d2118e5fbc7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1108,7 +1108,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	if (unlikely(tp->repair)) {
 		if (tp->repair_queue == TCP_RECV_QUEUE) {
 			copied = tcp_send_rcvq(sk, msg, size);
-			goto out;
+			goto out_nopush;
 		}
 
 		err = -EINVAL;
@@ -1282,6 +1282,7 @@ wait_for_memory:
 out:
 	if (copied)
 		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+out_nopush:
 	release_sock(sk);
 	return copied + copied_syn;
 
-- 
cgit 


From 68b7107b62983f2cff0948292429d5f5999df096 Mon Sep 17 00:00:00 2001
From: Edward Allcutt <edward.allcutt@openmarket.com>
Date: Mon, 30 Jun 2014 16:16:02 +0100
Subject: ipv4: icmp: Fix pMTU handling for rare case

Some older router implementations still send Fragmentation Needed
errors with the Next-Hop MTU field set to zero. This is explicitly
described as an eventuality that hosts must deal with by the
standard (RFC 1191) since older standards specified that those
bits must be zero.

Linux had a generic (for all of IPv4) implementation of the algorithm
described in the RFC for searching a list of MTU plateaus for a good
value. Commit 46517008e116 ("ipv4: Kill ip_rt_frag_needed().")
removed this as part of the changes to remove the routing cache.
Subsequently any Fragmentation Needed packet with a zero Next-Hop
MTU has been discarded without being passed to the per-protocol
handlers or notifying userspace for raw sockets.

When there is a router which does not implement RFC 1191 on an
MTU limited path then this results in stalled connections since
large packets are discarded and the local protocols are not
notified so they never attempt to lower the pMTU.

One example I have seen is an OpenBSD router terminating IPSec
tunnels. It's worth pointing out that this case is distinct from
the BSD 4.2 bug which incorrectly calculated the Next-Hop MTU
since the commit in question dismissed that as a valid concern.

All of the per-protocols handlers implement the simple approach from
RFC 1191 of immediately falling back to the minimum value. Although
this is sub-optimal it is vastly preferable to connections hanging
indefinitely.

Remove the Next-Hop MTU != 0 check and allow such packets
to follow the normal path.

Fixes: 46517008e116 ("ipv4: Kill ip_rt_frag_needed().")
Signed-off-by: Edward Allcutt <edward.allcutt@openmarket.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/icmp.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 79c3d947a481..42b7bcf8045b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -739,8 +739,6 @@ static void icmp_unreach(struct sk_buff *skb)
 				/* fall through */
 			case 0:
 				info = ntohs(icmph->un.frag.mtu);
-				if (!info)
-					goto out;
 			}
 			break;
 		case ICMP_SR_FAILED:
-- 
cgit 


From 11ef7a8996d5d433c9cd75d80651297eccbf6d42 Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Mon, 30 Jun 2014 09:50:40 -0700
Subject: net: Performance fix for process_backlog

In process_backlog the input_pkt_queue is only checked once for new
packets and quota is artificially reduced to reflect precisely the
number of packets on the input_pkt_queue so that the loop exits
appropriately.

This patches changes the behavior to be more straightforward and
less convoluted. Packets are processed until either the quota
is met or there are no more packets to process.

This patch seems to provide a small, but noticeable performance
improvement. The performance improvement is a result of staying
in the process_backlog loop longer which can reduce number of IPI's.

Performance data using super_netperf TCP_RR with 200 flows:

Before fix:

88.06% CPU utilization
125/190/309 90/95/99% latencies
1.46808e+06 tps
1145382 intrs.sec.

With fix:

87.73% CPU utilization
122/183/296 90/95/99% latencies
1.4921e+06 tps
1021674.30 intrs./sec.

Signed-off-by: Tom Herbert <therbert@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 30eedf677913..77c19c7bb490 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4227,9 +4227,8 @@ static int process_backlog(struct napi_struct *napi, int quota)
 #endif
 	napi->weight = weight_p;
 	local_irq_disable();
-	while (work < quota) {
+	while (1) {
 		struct sk_buff *skb;
-		unsigned int qlen;
 
 		while ((skb = __skb_dequeue(&sd->process_queue))) {
 			local_irq_enable();
@@ -4243,24 +4242,24 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		}
 
 		rps_lock(sd);
-		qlen = skb_queue_len(&sd->input_pkt_queue);
-		if (qlen)
-			skb_queue_splice_tail_init(&sd->input_pkt_queue,
-						   &sd->process_queue);
-
-		if (qlen < quota - work) {
+		if (skb_queue_empty(&sd->input_pkt_queue)) {
 			/*
 			 * Inline a custom version of __napi_complete().
 			 * only current cpu owns and manipulates this napi,
-			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
-			 * we can use a plain write instead of clear_bit(),
+			 * and NAPI_STATE_SCHED is the only possible flag set
+			 * on backlog.
+			 * We can use a plain write instead of clear_bit(),
 			 * and we dont need an smp_mb() memory barrier.
 			 */
 			list_del(&napi->poll_list);
 			napi->state = 0;
+			rps_unlock(sd);
 
-			quota = work + qlen;
+			break;
 		}
+
+		skb_queue_splice_tail_init(&sd->input_pkt_queue,
+					   &sd->process_queue);
 		rps_unlock(sd);
 	}
 	local_irq_enable();
-- 
cgit 


From 54951194656e4853e441266fd095f880bc0398f3 Mon Sep 17 00:00:00 2001
From: Loic Prylli <loicp@google.com>
Date: Tue, 1 Jul 2014 21:39:43 -0700
Subject: net: Fix NETDEV_CHANGE notifier usage causing spurious arp flush

A bug was introduced in NETDEV_CHANGE notifier sequence causing the
arp table to be sometimes spuriously cleared (including manual arp
entries marked permanent), upon network link carrier changes.

The changed argument for the notifier was applied only to a single
caller of NETDEV_CHANGE, missing among others netdev_state_change().
So upon net_carrier events induced by the network, which are
triggering a call to netdev_state_change(), arp_netdev_event() would
decide whether to clear or not arp cache based on random/junk stack
values (a kind of read buffer overflow).

Fixes: be9efd365328 ("net: pass changed flags along with NETDEV_CHANGE event")
Fixes: 6c8b4e3ff81b ("arp: flush arp cache on IFF_NOARP change")
Signed-off-by: Loic Prylli <loicp@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 77c19c7bb490..7990984ca364 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -148,6 +148,9 @@ struct list_head ptype_all __read_mostly;	/* Taps */
 static struct list_head offload_base __read_mostly;
 
 static int netif_rx_internal(struct sk_buff *skb);
+static int call_netdevice_notifiers_info(unsigned long val,
+					 struct net_device *dev,
+					 struct netdev_notifier_info *info);
 
 /*
  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
@@ -1207,7 +1210,11 @@ EXPORT_SYMBOL(netdev_features_change);
 void netdev_state_change(struct net_device *dev)
 {
 	if (dev->flags & IFF_UP) {
-		call_netdevice_notifiers(NETDEV_CHANGE, dev);
+		struct netdev_notifier_change_info change_info;
+
+		change_info.flags_changed = 0;
+		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
+					      &change_info.info);
 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
 	}
 }
-- 
cgit 


From 52ad353a5344f1f700c5b777175bdfa41d3cd65a Mon Sep 17 00:00:00 2001
From: dingtianhong <dingtianhong@huawei.com>
Date: Wed, 2 Jul 2014 13:50:48 +0800
Subject: igmp: fix the problem when mc leave group

The problem was triggered by these steps:

1) create socket, bind and then setsockopt for add mc group.
   mreq.imr_multiaddr.s_addr = inet_addr("255.0.0.37");
   mreq.imr_interface.s_addr = inet_addr("192.168.1.2");
   setsockopt(sockfd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq));

2) drop the mc group for this socket.
   mreq.imr_multiaddr.s_addr = inet_addr("255.0.0.37");
   mreq.imr_interface.s_addr = inet_addr("0.0.0.0");
   setsockopt(sockfd, IPPROTO_IP, IP_DROP_MEMBERSHIP, &mreq, sizeof(mreq));

3) and then drop the socket, I found the mc group was still used by the dev:

   netstat -g

   Interface       RefCnt Group
   --------------- ------ ---------------------
   eth2		   1	  255.0.0.37

Normally even though the IP_DROP_MEMBERSHIP return error, the mc group still need
to be released for the netdev when drop the socket, but this process was broken when
route default is NULL, the reason is that:

The ip_mc_leave_group() will choose the in_dev by the imr_interface.s_addr, if input addr
is NULL, the default route dev will be chosen, then the ifindex is got from the dev,
then polling the inet->mc_list and return -ENODEV, but if the default route dev is NULL,
the in_dev and ifIndex is both NULL, when polling the inet->mc_list, the mc group will be
released from the mc_list, but the dev didn't dec the refcnt for this mc group, so
when dropping the socket, the mc_list is NULL and the dev still keep this group.

v1->v2: According Hideaki's suggestion, we should align with IPv6 (RFC3493) and BSDs,
	so I add the checking for the in_dev before polling the mc_list, make sure when
	we remove the mc group, dec the refcnt to the real dev which was using the mc address.
	The problem would never happened again.

Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 6748d420f714..db710b059bab 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1944,6 +1944,10 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
 
 	rtnl_lock();
 	in_dev = ip_mc_find_dev(net, imr);
+	if (!in_dev) {
+		ret = -ENODEV;
+		goto out;
+	}
 	ifindex = imr->imr_ifindex;
 	for (imlp = &inet->mc_list;
 	     (iml = rtnl_dereference(*imlp)) != NULL;
@@ -1961,16 +1965,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
 
 		*imlp = iml->next_rcu;
 
-		if (in_dev)
-			ip_mc_dec_group(in_dev, group);
+		ip_mc_dec_group(in_dev, group);
 		rtnl_unlock();
 		/* decrease mem now to avoid the memleak warning */
 		atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
 		kfree_rcu(iml, rcu);
 		return 0;
 	}
-	if (!in_dev)
-		ret = -ENODEV;
+out:
 	rtnl_unlock();
 	return ret;
 }
-- 
cgit 


From 6e08d5e3c8236e7484229e46fdf92006e1dd4c49 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 2 Jul 2014 12:07:16 -0700
Subject: tcp: fix false undo corner cases

The undo code assumes that, upon entering loss recovery, TCP
1) always retransmit something
2) the retransmission never fails locally (e.g., qdisc drop)

so undo_marker is set in tcp_enter_recovery() and undo_retrans is
incremented only when tcp_retransmit_skb() is successful.

When the assumption is broken because TCP's cwnd is too small to
retransmit or the retransmit fails locally. The next (DUP)ACK
would incorrectly revert the cwnd and the congestion state in
tcp_try_undo_dsack() or tcp_may_undo(). Subsequent (DUP)ACKs
may enter the recovery state. The sender repeatedly enter and
(incorrectly) exit recovery states if the retransmits continue to
fail locally while receiving (DUP)ACKs.

The fix is to initialize undo_retrans to -1 and start counting on
the first retransmission. Always increment undo_retrans even if the
retransmissions fail locally because they couldn't cause DSACKs to
undo the cwnd reduction.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c  | 8 ++++----
 net/ipv4/tcp_output.c | 6 ++++--
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b5c23756965a..40639c288dc2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1106,7 +1106,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
 	}
 
 	/* D-SACK for already forgotten data... Do dumb counting. */
-	if (dup_sack && tp->undo_marker && tp->undo_retrans &&
+	if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
 	    !after(end_seq_0, prior_snd_una) &&
 	    after(end_seq_0, tp->undo_marker))
 		tp->undo_retrans--;
@@ -1187,7 +1187,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
 
 	/* Account D-SACK for retransmitted packet. */
 	if (dup_sack && (sacked & TCPCB_RETRANS)) {
-		if (tp->undo_marker && tp->undo_retrans &&
+		if (tp->undo_marker && tp->undo_retrans > 0 &&
 		    after(end_seq, tp->undo_marker))
 			tp->undo_retrans--;
 		if (sacked & TCPCB_SACKED_ACKED)
@@ -1893,7 +1893,7 @@ static void tcp_clear_retrans_partial(struct tcp_sock *tp)
 	tp->lost_out = 0;
 
 	tp->undo_marker = 0;
-	tp->undo_retrans = 0;
+	tp->undo_retrans = -1;
 }
 
 void tcp_clear_retrans(struct tcp_sock *tp)
@@ -2665,7 +2665,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 
 	tp->prior_ssthresh = 0;
 	tp->undo_marker = tp->snd_una;
-	tp->undo_retrans = tp->retrans_out;
+	tp->undo_retrans = tp->retrans_out ? : -1;
 
 	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
 		if (!ece_ack)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d92bce0ea24e..179b51e6bda3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2525,8 +2525,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		if (!tp->retrans_stamp)
 			tp->retrans_stamp = TCP_SKB_CB(skb)->when;
 
-		tp->undo_retrans += tcp_skb_pcount(skb);
-
 		/* snd_nxt is stored to detect loss of retransmitted segment,
 		 * see tcp_input.c tcp_sacktag_write_queue().
 		 */
@@ -2534,6 +2532,10 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	} else if (err != -EBUSY) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
+
+	if (tp->undo_retrans < 0)
+		tp->undo_retrans = 0;
+	tp->undo_retrans += tcp_skb_pcount(skb);
 	return err;
 }
 
-- 
cgit 


From 29322d0db98e5a84f5cc6a55655bee3dc4ffb5ab Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy <jon.maloy@ericsson.com>
Date: Sat, 5 Jul 2014 13:44:13 -0400
Subject: tipc: fix bug in multicast/broadcast message reassembly

Since commit 37e22164a8a3c39bdad45aa463b1e69a1fdf4110 ("tipc: rename and
move message reassembly function") reassembly of long broadcast messages
has been broken. This is because we test for a non-NULL return value
of the *buf parameter as criteria for succesful reassembly. However, this
parameter is left defined even after reception of the first fragment,
when reassebly is still incomplete. This leads to a kernel crash as soon
as a the first fragment of a long broadcast message is received.

We fix this with this commit, by implementing a stricter behavior of the
function and its return values.

This commit should be applied to both net and net-next.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/msg.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 8be6e94a1ca9..0a37a472c29f 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -101,9 +101,11 @@ int tipc_msg_build(struct tipc_msg *hdr, struct iovec const *msg_sect,
 }
 
 /* tipc_buf_append(): Append a buffer to the fragment list of another buffer
- * Let first buffer become head buffer
- * Returns 1 and sets *buf to headbuf if chain is complete, otherwise 0
- * Leaves headbuf pointer at NULL if failure
+ * @*headbuf: in:  NULL for first frag, otherwise value returned from prev call
+ *            out: set when successful non-complete reassembly, otherwise NULL
+ * @*buf:     in:  the buffer to append. Always defined
+ *            out: head buf after sucessful complete reassembly, otherwise NULL
+ * Returns 1 when reassembly complete, otherwise 0
  */
 int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf)
 {
@@ -122,6 +124,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf)
 			goto out_free;
 		head = *headbuf = frag;
 		skb_frag_list_init(head);
+		*buf = NULL;
 		return 0;
 	}
 	if (!head)
@@ -150,5 +153,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf)
 out_free:
 	pr_warn_ratelimited("Unable to build fragment list\n");
 	kfree_skb(*buf);
+	kfree_skb(*headbuf);
+	*buf = *headbuf = NULL;
 	return 0;
 }
-- 
cgit 


From e0056593b61253f1a8a9941dacda22e73b963cdc Mon Sep 17 00:00:00 2001
From: Dmitry Popov <ixaphire@qrator.net>
Date: Sat, 5 Jul 2014 02:26:37 +0400
Subject: ip_tunnel: fix ip_tunnel_lookup

This patch fixes 3 similar bugs where incoming packets might be routed into
wrong non-wildcard tunnels:

1) Consider the following setup:
    ip address add 1.1.1.1/24 dev eth0
    ip address add 1.1.1.2/24 dev eth0
    ip tunnel add ipip1 remote 2.2.2.2 local 1.1.1.1 mode ipip dev eth0
    ip link set ipip1 up

Incoming ipip packets from 2.2.2.2 were routed into ipip1 even if it has dst =
1.1.1.2. Moreover even if there was wildcard tunnel like
   ip tunnel add ipip0 remote 2.2.2.2 local any mode ipip dev eth0
but it was created before explicit one (with local 1.1.1.1), incoming ipip
packets with src = 2.2.2.2 and dst = 1.1.1.2 were still routed into ipip1.

Same issue existed with all tunnels that use ip_tunnel_lookup (gre, vti)

2)  ip address add 1.1.1.1/24 dev eth0
    ip tunnel add ipip1 remote 2.2.146.85 local 1.1.1.1 mode ipip dev eth0
    ip link set ipip1 up

Incoming ipip packets with dst = 1.1.1.1 were routed into ipip1, no matter what
src address is. Any remote ip address which has ip_tunnel_hash = 0 raised this
issue, 2.2.146.85 is just an example, there are more than 4 million of them.
And again, wildcard tunnel like
   ip tunnel add ipip0 remote any local 1.1.1.1 mode ipip dev eth0
wouldn't be ever matched if it was created before explicit tunnel like above.

Gre & vti tunnels had the same issue.

3)  ip address add 1.1.1.1/24 dev eth0
    ip tunnel add gre1 remote 2.2.146.84 local 1.1.1.1 key 1 mode gre dev eth0
    ip link set gre1 up

Any incoming gre packet with key = 1 were routed into gre1, no matter what
src/dst addresses are. Any remote ip address which has ip_tunnel_hash = 0 raised
the issue, 2.2.146.84 is just an example, there are more than 4 million of them.
Wildcard tunnel like
   ip tunnel add gre2 remote any local any key 1 mode gre dev eth0
wouldn't be ever matched if it was created before explicit tunnel like above.

All this stuff happened because while looking for a wildcard tunnel we didn't
check that matched tunnel is a wildcard one. Fixed.

Signed-off-by: Dmitry Popov <ixaphire@qrator.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_tunnel.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 54b6731dab55..6f9de61dce5f 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -169,6 +169,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
 
 	hlist_for_each_entry_rcu(t, head, hash_node) {
 		if (remote != t->parms.iph.daddr ||
+		    t->parms.iph.saddr != 0 ||
 		    !(t->dev->flags & IFF_UP))
 			continue;
 
@@ -185,10 +186,11 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
 	head = &itn->tunnels[hash];
 
 	hlist_for_each_entry_rcu(t, head, hash_node) {
-		if ((local != t->parms.iph.saddr &&
-		     (local != t->parms.iph.daddr ||
-		      !ipv4_is_multicast(local))) ||
-		    !(t->dev->flags & IFF_UP))
+		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
+		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
+			continue;
+
+		if (!(t->dev->flags & IFF_UP))
 			continue;
 
 		if (!ip_tunnel_key_match(&t->parms, flags, key))
@@ -205,6 +207,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
 
 	hlist_for_each_entry_rcu(t, head, hash_node) {
 		if (t->parms.i_key != key ||
+		    t->parms.iph.saddr != 0 ||
+		    t->parms.iph.daddr != 0 ||
 		    !(t->dev->flags & IFF_UP))
 			continue;
 
-- 
cgit 


From 36beddc272c111689f3042bf3d10a64d8a805f93 Mon Sep 17 00:00:00 2001
From: Andrey Utkin <andrey.krieger.utkin@gmail.com>
Date: Mon, 7 Jul 2014 23:22:50 +0300
Subject: appletalk: Fix socket referencing in skb

Setting just skb->sk without taking its reference and setting a
destructor is invalid. However, in the places where this was done, skb
is used in a way not requiring skb->sk setting. So dropping the setting
of skb->sk.
Thanks to Eric Dumazet <eric.dumazet@gmail.com> for correct solution.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=79441
Reported-by: Ed Martin <edman007@edman007.com>
Signed-off-by: Andrey Utkin <andrey.krieger.utkin@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/appletalk/ddp.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net')

diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 01a1082e02b3..bfcf6be1d665 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1489,8 +1489,6 @@ static int atalk_rcv(struct sk_buff *skb, struct net_device *dev,
 		goto drop;
 
 	/* Queue packet (standard) */
-	skb->sk = sock;
-
 	if (sock_queue_rcv_skb(sock, skb) < 0)
 		goto drop;
 
@@ -1644,7 +1642,6 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
 	if (!skb)
 		goto out;
 
-	skb->sk = sk;
 	skb_reserve(skb, ddp_dl->header_length);
 	skb_reserve(skb, dev->hard_header_len);
 	skb->dev = dev;
-- 
cgit 


From ac30ef832e6af0505b6f0251a6659adcfa74975e Mon Sep 17 00:00:00 2001
From: Ben Pfaff <blp@nicira.com>
Date: Wed, 9 Jul 2014 10:31:22 -0700
Subject: netlink: Fix handling of error from netlink_dump().

netlink_dump() returns a negative errno value on error.  Until now,
netlink_recvmsg() directly recorded that negative value in sk->sk_err, but
that's wrong since sk_err takes positive errno values.  (This manifests as
userspace receiving a positive return value from the recv() system call,
falsely indicating success.) This bug was introduced in the commit that
started checking the netlink_dump() return value, commit b44d211 (netlink:
handle errors from netlink_dump()).

Multithreaded Netlink dumps are one way to trigger this behavior in
practice, as described in the commit message for the userspace workaround
posted here:
    http://openvswitch.org/pipermail/dev/2014-June/042339.html

This commit also fixes the same bug in netlink_poll(), introduced in commit
cd1df525d (netlink: add flow control for memory mapped I/O).

Signed-off-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 15c731f03fa6..e6fac7e3db52 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -636,7 +636,7 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock,
 		while (nlk->cb_running && netlink_dump_space(nlk)) {
 			err = netlink_dump(sk);
 			if (err < 0) {
-				sk->sk_err = err;
+				sk->sk_err = -err;
 				sk->sk_error_report(sk);
 				break;
 			}
@@ -2483,7 +2483,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	    atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
 		ret = netlink_dump(sk);
 		if (ret) {
-			sk->sk_err = ret;
+			sk->sk_err = -ret;
 			sk->sk_error_report(sk);
 		}
 	}
-- 
cgit 


From d0a7ebbc119738439ff00f7fadbd343ae20ea5e8 Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Thu, 10 Jul 2014 17:29:21 -0700
Subject: GRE: enable offloads for GRE

To get offloads to work with Generic Routing Encapsulation (GRE), the
outer transport header has to be reset after skb_push is done. This
patch has the support for this fix and hence GRE offloading.

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Signed-off-by: Joseph Gasparakis <joseph.gasparakis@intel.com>
Tested-By: Jim Young <jamesx.m.young@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/gre_demux.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 4e9619bca732..0485bf7f8f03 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -68,6 +68,7 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
 
 	skb_push(skb, hdr_len);
 
+	skb_reset_transport_header(skb);
 	greh = (struct gre_base_hdr *)skb->data;
 	greh->flags = tnl_flags_to_gre_flags(tpi->flags);
 	greh->protocol = tpi->proto;
-- 
cgit 


From 999417549c16dd0e3a382aa9f6ae61688db03181 Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy <jon.maloy@ericsson.com>
Date: Fri, 11 Jul 2014 08:45:27 -0400
Subject: tipc: clear 'next'-pointer of message fragments before reassembly

If the 'next' pointer of the last fragment buffer in a message is not
zeroed before reassembly, we risk ending up with a corrupt message,
since the reassembly function itself isn't doing this.

Currently, when a buffer is retrieved from the deferred queue of the
broadcast link, the next pointer is not cleared, with the result as
described above.

This commit corrects this, and thereby fixes a bug that may occur when
long broadcast messages are transmitted across dual interfaces. The bug
has been present since 40ba3cdf542a469aaa9083fa041656e59b109b90 ("tipc:
message reassembly using fragment chain")

This commit should be applied to both net and net-next.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/bcast.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 26631679a1fa..55c6c9d3e1ce 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -559,6 +559,7 @@ receive:
 
 		buf = node->bclink.deferred_head;
 		node->bclink.deferred_head = buf->next;
+		buf->next = NULL;
 		node->bclink.deferred_size--;
 		goto receive;
 	}
-- 
cgit 


From 8f2e5ae40ec193bc0a0ed99e95315c3eebca84ea Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Sat, 12 Jul 2014 20:30:35 +0200
Subject: net: sctp: fix information leaks in ulpevent layer

While working on some other SCTP code, I noticed that some
structures shared with user space are leaking uninitialized
stack or heap buffer. In particular, struct sctp_sndrcvinfo
has a 2 bytes hole between .sinfo_flags and .sinfo_ppid that
remains unfilled by us in sctp_ulpevent_read_sndrcvinfo() when
putting this into cmsg. But also struct sctp_remote_error
contains a 2 bytes hole that we don't fill but place into a skb
through skb_copy_expand() via sctp_ulpevent_make_remote_error().

Both structures are defined by the IETF in RFC6458:

* Section 5.3.2. SCTP Header Information Structure:

  The sctp_sndrcvinfo structure is defined below:

  struct sctp_sndrcvinfo {
    uint16_t sinfo_stream;
    uint16_t sinfo_ssn;
    uint16_t sinfo_flags;
    <-- 2 bytes hole  -->
    uint32_t sinfo_ppid;
    uint32_t sinfo_context;
    uint32_t sinfo_timetolive;
    uint32_t sinfo_tsn;
    uint32_t sinfo_cumtsn;
    sctp_assoc_t sinfo_assoc_id;
  };

* 6.1.3. SCTP_REMOTE_ERROR:

  A remote peer may send an Operation Error message to its peer.
  This message indicates a variety of error conditions on an
  association. The entire ERROR chunk as it appears on the wire
  is included in an SCTP_REMOTE_ERROR event. Please refer to the
  SCTP specification [RFC4960] and any extensions for a list of
  possible error formats. An SCTP error notification has the
  following format:

  struct sctp_remote_error {
    uint16_t sre_type;
    uint16_t sre_flags;
    uint32_t sre_length;
    uint16_t sre_error;
    <-- 2 bytes hole  -->
    sctp_assoc_t sre_assoc_id;
    uint8_t  sre_data[];
  };

Fix this by setting both to 0 before filling them out. We also
have other structures shared between user and kernel space in
SCTP that contains holes (e.g. struct sctp_paddrthlds), but we
copy that buffer over from user space first and thus don't need
to care about it in that cases.

While at it, we can also remove lengthy comments copied from
the draft, instead, we update the comment with the correct RFC
number where one can look it up.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/ulpevent.c | 122 +++++++---------------------------------------------
 1 file changed, 15 insertions(+), 107 deletions(-)

(limited to 'net')

diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 85c64658bd0b..b6842fdb53d4 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -366,9 +366,10 @@ fail:
  * specification [SCTP] and any extensions for a list of possible
  * error formats.
  */
-struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
-	const struct sctp_association *asoc, struct sctp_chunk *chunk,
-	__u16 flags, gfp_t gfp)
+struct sctp_ulpevent *
+sctp_ulpevent_make_remote_error(const struct sctp_association *asoc,
+				struct sctp_chunk *chunk, __u16 flags,
+				gfp_t gfp)
 {
 	struct sctp_ulpevent *event;
 	struct sctp_remote_error *sre;
@@ -387,8 +388,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
 	/* Copy the skb to a new skb with room for us to prepend
 	 * notification with.
 	 */
-	skb = skb_copy_expand(chunk->skb, sizeof(struct sctp_remote_error),
-			      0, gfp);
+	skb = skb_copy_expand(chunk->skb, sizeof(*sre), 0, gfp);
 
 	/* Pull off the rest of the cause TLV from the chunk.  */
 	skb_pull(chunk->skb, elen);
@@ -399,62 +399,21 @@ struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
 	event = sctp_skb2event(skb);
 	sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize);
 
-	sre = (struct sctp_remote_error *)
-		skb_push(skb, sizeof(struct sctp_remote_error));
+	sre = (struct sctp_remote_error *) skb_push(skb, sizeof(*sre));
 
 	/* Trim the buffer to the right length.  */
-	skb_trim(skb, sizeof(struct sctp_remote_error) + elen);
+	skb_trim(skb, sizeof(*sre) + elen);
 
-	/* Socket Extensions for SCTP
-	 * 5.3.1.3 SCTP_REMOTE_ERROR
-	 *
-	 * sre_type:
-	 *   It should be SCTP_REMOTE_ERROR.
-	 */
+	/* RFC6458, Section 6.1.3. SCTP_REMOTE_ERROR */
+	memset(sre, 0, sizeof(*sre));
 	sre->sre_type = SCTP_REMOTE_ERROR;
-
-	/*
-	 * Socket Extensions for SCTP
-	 * 5.3.1.3 SCTP_REMOTE_ERROR
-	 *
-	 * sre_flags: 16 bits (unsigned integer)
-	 *   Currently unused.
-	 */
 	sre->sre_flags = 0;
-
-	/* Socket Extensions for SCTP
-	 * 5.3.1.3 SCTP_REMOTE_ERROR
-	 *
-	 * sre_length: sizeof (__u32)
-	 *
-	 * This field is the total length of the notification data,
-	 * including the notification header.
-	 */
 	sre->sre_length = skb->len;
-
-	/* Socket Extensions for SCTP
-	 * 5.3.1.3 SCTP_REMOTE_ERROR
-	 *
-	 * sre_error: 16 bits (unsigned integer)
-	 * This value represents one of the Operational Error causes defined in
-	 * the SCTP specification, in network byte order.
-	 */
 	sre->sre_error = cause;
-
-	/* Socket Extensions for SCTP
-	 * 5.3.1.3 SCTP_REMOTE_ERROR
-	 *
-	 * sre_assoc_id: sizeof (sctp_assoc_t)
-	 *
-	 * The association id field, holds the identifier for the association.
-	 * All notifications for a given association have the same association
-	 * identifier.  For TCP style socket, this field is ignored.
-	 */
 	sctp_ulpevent_set_owner(event, asoc);
 	sre->sre_assoc_id = sctp_assoc2id(asoc);
 
 	return event;
-
 fail:
 	return NULL;
 }
@@ -899,7 +858,9 @@ __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event)
 	return notification->sn_header.sn_type;
 }
 
-/* Copy out the sndrcvinfo into a msghdr.  */
+/* RFC6458, Section 5.3.2. SCTP Header Information Structure
+ * (SCTP_SNDRCV, DEPRECATED)
+ */
 void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event,
 				   struct msghdr *msghdr)
 {
@@ -908,74 +869,21 @@ void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event,
 	if (sctp_ulpevent_is_notification(event))
 		return;
 
-	/* Sockets API Extensions for SCTP
-	 * Section 5.2.2 SCTP Header Information Structure (SCTP_SNDRCV)
-	 *
-	 * sinfo_stream: 16 bits (unsigned integer)
-	 *
-	 * For recvmsg() the SCTP stack places the message's stream number in
-	 * this value.
-	*/
+	memset(&sinfo, 0, sizeof(sinfo));
 	sinfo.sinfo_stream = event->stream;
-	/* sinfo_ssn: 16 bits (unsigned integer)
-	 *
-	 * For recvmsg() this value contains the stream sequence number that
-	 * the remote endpoint placed in the DATA chunk.  For fragmented
-	 * messages this is the same number for all deliveries of the message
-	 * (if more than one recvmsg() is needed to read the message).
-	 */
 	sinfo.sinfo_ssn = event->ssn;
-	/* sinfo_ppid: 32 bits (unsigned integer)
-	 *
-	 * In recvmsg() this value is
-	 * the same information that was passed by the upper layer in the peer
-	 * application.  Please note that byte order issues are NOT accounted
-	 * for and this information is passed opaquely by the SCTP stack from
-	 * one end to the other.
-	 */
 	sinfo.sinfo_ppid = event->ppid;
-	/* sinfo_flags: 16 bits (unsigned integer)
-	 *
-	 * This field may contain any of the following flags and is composed of
-	 * a bitwise OR of these values.
-	 *
-	 * recvmsg() flags:
-	 *
-	 * SCTP_UNORDERED - This flag is present when the message was sent
-	 *                 non-ordered.
-	 */
 	sinfo.sinfo_flags = event->flags;
-	/* sinfo_tsn: 32 bit (unsigned integer)
-	 *
-	 * For the receiving side, this field holds a TSN that was
-	 * assigned to one of the SCTP Data Chunks.
-	 */
 	sinfo.sinfo_tsn = event->tsn;
-	/* sinfo_cumtsn: 32 bit (unsigned integer)
-	 *
-	 * This field will hold the current cumulative TSN as
-	 * known by the underlying SCTP layer.  Note this field is
-	 * ignored when sending and only valid for a receive
-	 * operation when sinfo_flags are set to SCTP_UNORDERED.
-	 */
 	sinfo.sinfo_cumtsn = event->cumtsn;
-	/* sinfo_assoc_id: sizeof (sctp_assoc_t)
-	 *
-	 * The association handle field, sinfo_assoc_id, holds the identifier
-	 * for the association announced in the COMMUNICATION_UP notification.
-	 * All notifications for a given association have the same identifier.
-	 * Ignored for one-to-one style sockets.
-	 */
 	sinfo.sinfo_assoc_id = sctp_assoc2id(event->asoc);
-
-	/* context value that is set via SCTP_CONTEXT socket option. */
+	/* Context value that is set via SCTP_CONTEXT socket option. */
 	sinfo.sinfo_context = event->asoc->default_rcv_context;
-
 	/* These fields are not used while receiving. */
 	sinfo.sinfo_timetolive = 0;
 
 	put_cmsg(msghdr, IPPROTO_SCTP, SCTP_SNDRCV,
-		 sizeof(struct sctp_sndrcvinfo), (void *)&sinfo);
+		 sizeof(sinfo), &sinfo);
 }
 
 /* Do accounting for bytes received and hold a reference to the association
-- 
cgit 


From 9ecf07a1d8f70f72ec99a0f102c8aa24609d84f4 Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Sat, 12 Jul 2014 22:36:44 +0200
Subject: neigh: sysctl - simplify address calculation of gc_* variables

The code in neigh_sysctl_register() relies on a specific layout of
struct neigh_table, namely that the 'gc_*' variables are directly
following the 'parms' member in a specific order. The code, though,
expresses this in the most ugly way.

Get rid of the ugly casts and use the 'tbl' pointer to get a handle to
the table. This way we can refer to the 'gc_*' variables directly.

Similarly seen in the grsecurity patch, written by Brad Spengler.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Cc: Brad Spengler <spender@grsecurity.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 32d872eec7f5..559890b0f0a2 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -3059,11 +3059,12 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
 		memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
 		       sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
 	} else {
+		struct neigh_table *tbl = p->tbl;
 		dev_name_source = "default";
-		t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1);
-		t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1;
-		t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2;
-		t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3;
+		t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval;
+		t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1;
+		t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2;
+		t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3;
 	}
 
 	if (handler) {
-- 
cgit