From b53542073927878b18d642f6bf794adef6d45a18 Mon Sep 17 00:00:00 2001
From: Kumar Gala
Date: Wed, 22 Jun 2005 09:58:03 -0500
Subject: [PATCH] Fix extra double quote in IPV4 Kconfig

Kconfig option had an extra double quote at the end of the line, which
was causing a warning when building.

Signed-off-by: Kumar Gala
Signed-off-by: Linus Torvalds
---
 net/ipv4/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 05107e0dc145..567b03b1c349 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -2,7 +2,7 @@
 # IP configuration
 #
 choice
-	prompt "Choose IP: FIB lookup""
+	prompt "Choose IP: FIB lookup"
 	depends on INET
 	default IP_FIB_HASH
-- 
cgit

From 5d927eb0101eb791fb2d4f72b49a2da5faf01941 Mon Sep 17 00:00:00 2001
From: Harald Welte
Date: Wed, 22 Jun 2005 12:37:50 -0700
Subject: [NETFILTER]: Fix handling of ICMP packets (RELATED) in ipt_CLUSTERIP target.

Signed-off-by: Harald Welte
Signed-off-by: David S. Miller
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'net')

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index dc4362b57cfa..9cde8c61f525 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -339,7 +339,7 @@ target(struct sk_buff **pskb,
 	 * error messages (RELATED) and information requests (see below) */
 	if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
 	    && (ctinfo == IP_CT_RELATED
-		|| ctinfo == IP_CT_IS_REPLY+IP_CT_IS_REPLY))
+		|| ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
 		return IPT_CONTINUE;

 	/* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
-- 
cgit
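A note on the fix above, with a small illustrative sketch (not part of the patch): in 2.6-era connection tracking, a state observed in the reply direction is expressed as the state value plus IP_CT_IS_REPLY, so "RELATED, in either direction" is IP_CT_RELATED or IP_CT_RELATED + IP_CT_IS_REPLY. The typo compared against IP_CT_IS_REPLY + IP_CT_IS_REPLY, which matches no meaningful state:

	/* illustrative helper, equivalent to the corrected condition */
	static inline int icmp_related_any_dir(enum ip_conntrack_info ctinfo)
	{
		return ctinfo == IP_CT_RELATED ||
		       ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY;
	}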
From d05fdb0cec75415b2d9eb95748386e67414e49c3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Wed, 22 Jun 2005 17:16:19 +0000
Subject: [PATCH] RPC: Fix a race with rpc_restart_call()

If the task->tk_exit() wants to restart the RPC call after delaying, then
the current RPC code will clobber the timer by calling rpc_delete_timer()
immediately after re-entering the loop in __rpc_execute().

Problem noticed by Oleg Nesterov

Signed-off-by: Trond Myklebust
---
 net/sunrpc/sched.c | 53 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 23 deletions(-)
(limited to 'net')

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index c06614d0e31d..cc298fa4b81d 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -554,6 +554,30 @@ __rpc_atrun(struct rpc_task *task)
 	rpc_wake_up_task(task);
 }

+/*
+ * Helper that calls task->tk_exit if it exists and then returns
+ * true if we should exit __rpc_execute.
+ */
+static inline int __rpc_do_exit(struct rpc_task *task)
+{
+	if (task->tk_exit != NULL) {
+		lock_kernel();
+		task->tk_exit(task);
+		unlock_kernel();
+		/* If tk_action is non-null, we should restart the call */
+		if (task->tk_action != NULL) {
+			if (!RPC_ASSASSINATED(task)) {
+				/* Release RPC slot and buffer memory */
+				xprt_release(task);
+				rpc_free(task);
+				return 0;
+			}
+			printk(KERN_ERR "RPC: dead task tried to walk away.\n");
+		}
+	}
+	return 1;
+}
+
 /*
  * This is the RPC `scheduler' (or rather, the finite state machine).
  */
@@ -566,8 +590,7 @@ static int __rpc_execute(struct rpc_task *task)

 	BUG_ON(RPC_IS_QUEUED(task));

-restarted:
-	while (1) {
+	for (;;) {
 		/*
 		 * Garbage collection of pending timers...
 		 */
@@ -600,11 +623,12 @@ static int __rpc_execute(struct rpc_task *task)
 		 * by someone else.
 		 */
 		if (!RPC_IS_QUEUED(task)) {
-			if (!task->tk_action)
+			if (task->tk_action != NULL) {
+				lock_kernel();
+				task->tk_action(task);
+				unlock_kernel();
+			} else if (__rpc_do_exit(task))
 				break;
-			lock_kernel();
-			task->tk_action(task);
-			unlock_kernel();
 		}

 		/*
@@ -645,23 +669,6 @@ static int __rpc_execute(struct rpc_task *task)
 			dprintk("RPC: %4d sync task resuming\n", task->tk_pid);
 		}

-		if (task->tk_exit) {
-			lock_kernel();
-			task->tk_exit(task);
-			unlock_kernel();
-			/* If tk_action is non-null, the user wants us to restart */
-			if (task->tk_action) {
-				if (!RPC_ASSASSINATED(task)) {
-					/* Release RPC slot and buffer memory */
-					if (task->tk_rqstp)
-						xprt_release(task);
-					rpc_free(task);
-					goto restarted;
-				}
-				printk(KERN_ERR "RPC: dead task tries to walk away.\n");
-			}
-		}
-
 	dprintk("RPC: %4d exit() = %d\n", task->tk_pid, task->tk_status);
 	status = task->tk_status;
-- 
cgit
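The contract that __rpc_do_exit() preserves can be shown as a hypothetical tk_exit callback (a sketch, not from the patch; rpc_restart_call() is the helper named in the subject line and rpc_delay() appears in later patches in this series):

	static void example_exit(struct rpc_task *task)
	{
		if (task->tk_status == -ETIMEDOUT) {
			rpc_delay(task, HZ);		/* back off for a second... */
			rpc_restart_call(task);		/* ...then re-enter the state machine */
		}
	}

By folding exit handling into the loop instead of jumping back to its top, the fix keeps a timer armed inside tk_exit() from being deleted before it can fire.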
From 334ccfd545bba9690515f2c5c167d5adb161989b Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Wed, 22 Jun 2005 17:16:19 +0000
Subject: [PATCH] RPC: Ensure XDR iovec length is initialized correctly in call_header

Fix up call_header() so that it calls xdr_adjust_iovec().
Fix calculation of the scratch buffer length in xdr_init_encode().

Signed-off-by: Trond Myklebust
---
 net/sunrpc/clnt.c |  4 +++-
 net/sunrpc/svc.c  |  1 +
 net/sunrpc/xdr.c  | 18 +++++++++++++++---
 3 files changed, 19 insertions(+), 4 deletions(-)
(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 02bc029d46fe..209aaf595695 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -957,7 +957,9 @@ call_header(struct rpc_task *task)
 	*p++ = htonl(clnt->cl_prog);	/* program number */
 	*p++ = htonl(clnt->cl_vers);	/* program version */
 	*p++ = htonl(task->tk_msg.rpc_proc->p_proc);	/* procedure */
-	return rpcauth_marshcred(task, p);
+	p = rpcauth_marshcred(task, p);
+	req->rq_slen = xdr_adjust_iovec(&req->rq_svec[0], p);
+	return p;
 }

 /*
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index bb2d99f33315..a02d424a7409 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -281,6 +281,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp)
 	rqstp->rq_res.len = 0;
 	rqstp->rq_res.page_base = 0;
 	rqstp->rq_res.page_len = 0;
+	rqstp->rq_res.buflen = PAGE_SIZE;
 	rqstp->rq_res.tail[0].iov_len = 0;
 	/* tcp needs a space for the record length... */
 	if (rqstp->rq_prot == IPPROTO_TCP)
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 67b9f035ba86..f86d1baa6302 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -616,12 +616,24 @@ xdr_shift_buf(struct xdr_buf *buf, size_t len)
 void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p)
 {
 	struct kvec *iov = buf->head;
+	int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len;

+	BUG_ON(scratch_len < 0);
 	xdr->buf = buf;
 	xdr->iov = iov;
-	xdr->end = (uint32_t *)((char *)iov->iov_base + iov->iov_len);
-	buf->len = iov->iov_len = (char *)p - (char *)iov->iov_base;
-	xdr->p = p;
+	xdr->p = (uint32_t *)((char *)iov->iov_base + iov->iov_len);
+	xdr->end = (uint32_t *)((char *)iov->iov_base + scratch_len);
+	BUG_ON(iov->iov_len > scratch_len);
+
+	if (p != xdr->p && p != NULL) {
+		size_t len;
+
+		BUG_ON(p < xdr->p || p > xdr->end);
+		len = (char *)p - (char *)xdr->p;
+		xdr->p = p;
+		buf->len += len;
+		iov->iov_len += len;
+	}
 }
 EXPORT_SYMBOL(xdr_init_encode);
-- 
cgit

From 5b616f5d596c0b056129f8aeafbc08409b3cd050 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Wed, 22 Jun 2005 17:16:20 +0000
Subject: [PATCH] RPC: Make rpc_create_client() destroy the transport on failure.

This saves us a couple of lines of cleanup code for each call.

Signed-off-by: Trond Myklebust
---
 net/sunrpc/clnt.c        | 1 +
 net/sunrpc/pmap_clnt.c   | 4 +---
 net/sunrpc/sunrpc_syms.c | 1 -
 3 files changed, 2 insertions(+), 4 deletions(-)
(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 209aaf595695..99515d7727a6 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -178,6 +178,7 @@ out_no_path:
 	kfree(clnt->cl_server);
 	kfree(clnt);
out_err:
+	xprt_destroy(xprt);
 	return ERR_PTR(err);
 }

diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index d0b1d2c34a4d..97c420ff1ee0 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -210,9 +210,7 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto)
 	clnt = rpc_create_client(xprt, hostname,
 			&pmap_program, RPC_PMAP_VERSION,
 			RPC_AUTH_UNIX);
-	if (IS_ERR(clnt)) {
-		xprt_destroy(xprt);
-	} else {
+	if (!IS_ERR(clnt)) {
 		clnt->cl_softrtry = 1;
 		clnt->cl_chatty   = 1;
 		clnt->cl_oneshot  = 1;
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index d4f26bf9e732..1b0ff7e0e869 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -61,7 +61,6 @@ EXPORT_SYMBOL(rpc_mkpipe);

 /* Client transport */
 EXPORT_SYMBOL(xprt_create_proto);
-EXPORT_SYMBOL(xprt_destroy);
 EXPORT_SYMBOL(xprt_set_timeout);
 EXPORT_SYMBOL(xprt_udp_slot_table_entries);
 EXPORT_SYMBOL(xprt_tcp_slot_table_entries);
-- 
cgit
From 5ee0ed7d3ab620a764740fb018f469d45f561931 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Wed, 22 Jun 2005 17:16:20 +0000
Subject: [PATCH] RPC: Make rpc_create_client() probe server for RPC program+version support

Ensure that we don't create an RPC client without checking that the server
does indeed support the RPC program + version that we are trying to set up.

This enables us to immediately return an error to "mount" if it turns out
that the server is only supporting NFSv2, when we requested NFSv3 or NFSv4.

Signed-off-by: Trond Myklebust
---
 net/sunrpc/clnt.c      | 59 +++++++++++++++++++++++++++++++++++++++++++++++++-
 net/sunrpc/pmap_clnt.c |  2 +-
 2 files changed, 59 insertions(+), 2 deletions(-)
(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 99515d7727a6..b36797ad8083 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -97,7 +97,7 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
  * made to sleep too long.
  */
 struct rpc_clnt *
-rpc_create_client(struct rpc_xprt *xprt, char *servname,
+rpc_new_client(struct rpc_xprt *xprt, char *servname,
 		  struct rpc_program *program, u32 vers,
 		  rpc_authflavor_t flavor)
 {
@@ -182,6 +182,36 @@ out_err:
 	return ERR_PTR(err);
 }

+/**
+ * Create an RPC client
+ * @xprt - pointer to xprt struct
+ * @servname - name of server
+ * @info - rpc_program
+ * @version - rpc_program version
+ * @authflavor - rpc_auth flavour to use
+ *
+ * Creates an RPC client structure, then pings the server in order to
+ * determine if it is up, and if it supports this program and version.
+ *
+ * This function should never be called by asynchronous tasks such as
+ * the portmapper.
+ */
+struct rpc_clnt *rpc_create_client(struct rpc_xprt *xprt, char *servname,
+		struct rpc_program *info, u32 version, rpc_authflavor_t authflavor)
+{
+	struct rpc_clnt *clnt;
+	int err;
+
+	clnt = rpc_new_client(xprt, servname, info, version, authflavor);
+	if (IS_ERR(clnt))
+		return clnt;
+	err = rpc_ping(clnt, RPC_TASK_SOFT|RPC_TASK_NOINTR);
+	if (err == 0)
+		return clnt;
+	rpc_shutdown_client(clnt);
+	return ERR_PTR(err);
+}
+
 /*
  * This function clones the RPC client structure. It allows us to share the
  * same transport while varying parameters such as the authentication
@@ -1086,3 +1116,30 @@ out_overflow:
 	printk(KERN_WARNING "RPC %s: server reply was truncated.\n", __FUNCTION__);
 	goto out_retry;
 }
+
+static int rpcproc_encode_null(void *rqstp, u32 *data, void *obj)
+{
+	return 0;
+}
+
+static int rpcproc_decode_null(void *rqstp, u32 *data, void *obj)
+{
+	return 0;
+}
+
+static struct rpc_procinfo rpcproc_null = {
+	.p_encode = rpcproc_encode_null,
+	.p_decode = rpcproc_decode_null,
+};
+
+int rpc_ping(struct rpc_clnt *clnt, int flags)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &rpcproc_null,
+	};
+	int err;
+	msg.rpc_cred = authnull_ops.lookup_cred(NULL, NULL, 0);
+	err = rpc_call_sync(clnt, &msg, flags);
+	put_rpccred(msg.rpc_cred);
+	return err;
+}
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index 97c420ff1ee0..df4d84c9020d 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -207,7 +207,7 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto)
 	xprt->addr.sin_port = htons(RPC_PMAP_PORT);

 	/* printk("pmap: create clnt\n"); */
-	clnt = rpc_create_client(xprt, hostname,
+	clnt = rpc_new_client(xprt, hostname,
 			&pmap_program, RPC_PMAP_VERSION,
 			RPC_AUTH_UNIX);
 	if (!IS_ERR(clnt)) {
-- 
cgit
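With this change, client creation fails fast. A hypothetical mount-path caller would now see the error at setup time (a sketch; the program argument is illustrative):

	clnt = rpc_create_client(xprt, hostname, &nfs_program, 3, RPC_AUTH_UNIX);
	if (IS_ERR(clnt))
		return PTR_ERR(clnt);	/* e.g. server only speaks NFSv2 */

Asynchronous contexts, such as the portmapper above, keep using rpc_new_client(), which skips the synchronous ping.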
From 96651ab341cde0fee940ec837f323d711cbfa7d5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Wed, 22 Jun 2005 17:16:21 +0000
Subject: [PATCH] RPC: Shrink struct rpc_task by switching to wait_on_bit()

Signed-off-by: Trond Myklebust
---
 net/sunrpc/sched.c | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)
(limited to 'net')

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index cc298fa4b81d..2d9eb7fbd521 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -290,7 +290,7 @@ static void rpc_make_runnable(struct rpc_task *task)
 			return;
 		}
 	} else
-		wake_up(&task->u.tk_wait.waitq);
+		wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
 }

 /*
@@ -578,6 +578,14 @@ static inline int __rpc_do_exit(struct rpc_task *task)
 	return 1;
 }

+static int rpc_wait_bit_interruptible(void *word)
+{
+	if (signal_pending(current))
+		return -ERESTARTSYS;
+	schedule();
+	return 0;
+}
+
 /*
  * This is the RPC `scheduler' (or rather, the finite state machine).
  */
@@ -648,22 +656,21 @@ static int __rpc_execute(struct rpc_task *task)

 		/* sync task: sleep here */
 		dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid);
-		if (RPC_TASK_UNINTERRUPTIBLE(task)) {
-			__wait_event(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task));
-		} else {
-			__wait_event_interruptible(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task), status);
+		/* Note: Caller should be using rpc_clnt_sigmask() */
+		status = out_of_line_wait_on_bit(&task->tk_runstate,
+				RPC_TASK_QUEUED, rpc_wait_bit_interruptible,
+				TASK_INTERRUPTIBLE);
+		if (status == -ERESTARTSYS) {
 			/*
 			 * When a sync task receives a signal, it exits with
 			 * -ERESTARTSYS. In order to catch any callbacks that
 			 * clean up after sleeping on some queue, we don't
 			 * break the loop here, but go around once more.
 			 */
-			if (status == -ERESTARTSYS) {
-				dprintk("RPC: %4d got signal\n", task->tk_pid);
-				task->tk_flags |= RPC_TASK_KILLED;
-				rpc_exit(task, -ERESTARTSYS);
-				rpc_wake_up_task(task);
-			}
+			dprintk("RPC: %4d got signal\n", task->tk_pid);
+			task->tk_flags |= RPC_TASK_KILLED;
+			rpc_exit(task, -ERESTARTSYS);
+			rpc_wake_up_task(task);
 		}
 		rpc_set_running(task);
 		dprintk("RPC: %4d sync task resuming\n", task->tk_pid);
@@ -766,8 +773,6 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, rpc_action call
 	/* Initialize workqueue for async tasks */
 	task->tk_workqueue = rpciod_workqueue;
-	if (!RPC_IS_ASYNC(task))
-		init_waitqueue_head(&task->u.tk_wait.waitq);

 	if (clnt) {
 		atomic_inc(&clnt->cl_users);
-- 
cgit
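The space saving comes from dropping the embedded wait_queue_head_t and sleeping directly on a flag bit in tk_runstate. The generic pattern, as a sketch using the calls visible in the diff (obj and MY_BUSY_BIT are illustrative):

	/* waiter: sleep until the bit clears */
	out_of_line_wait_on_bit(&obj->state, MY_BUSY_BIT,
				rpc_wait_bit_interruptible, TASK_INTERRUPTIBLE);

	/* waker: clear the bit, then wake the bit-waiters */
	clear_bit(MY_BUSY_BIT, &obj->state);
	smp_mb__after_clear_bit();
	wake_up_bit(&obj->state, MY_BUSY_BIT);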
From 438b6fdebf2a2e8573e7290bc176feb4d4475f43 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields"
Date: Wed, 22 Jun 2005 17:16:23 +0000
Subject: [PATCH] RPC: Don't fall back from krb5p to krb5i

We shouldn't be silently falling back from krb5p to krb5i.

Signed-off-by: J. Bruce Fields
Signed-off-by: Trond Myklebust
---
 net/sunrpc/auth_gss/auth_gss.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)
(limited to 'net')

diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index a33b627cbef4..7d88db83ab12 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -675,9 +675,8 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
 		goto err_free;
 	}
 	gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor);
-	/* FIXME: Will go away once privacy support is merged in */
-	if (gss_auth->service == RPC_GSS_SVC_PRIVACY)
-		gss_auth->service = RPC_GSS_SVC_INTEGRITY;
+	if (gss_auth->service == 0)
+		goto err_put_mech;
 	INIT_LIST_HEAD(&gss_auth->upcalls);
 	spin_lock_init(&gss_auth->lock);
 	auth = &gss_auth->rpc_auth;
-- 
cgit

From 6a19275ada9137435da58990c8f8d3f58e170bf1 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields"
Date: Wed, 22 Jun 2005 17:16:23 +0000
Subject: [PATCH] RPC: improve rpcauth_create error returns

Currently we return -ENOMEM for every single failure to create a new auth.
This is actually accurate for auth_null and auth_unix, but for auth_gss it's
a bit confusing. Allow rpcauth_create (and the ->create methods) to return
errors. With this patch, the user may sometimes see an EINVAL instead. Whee.

Signed-off-by: J. Bruce Fields
Signed-off-by: Trond Myklebust
---
 net/sunrpc/auth.c              |  6 +++---
 net/sunrpc/auth_gss/auth_gss.c | 13 +++++++++----
 net/sunrpc/clnt.c              |  6 ++++--
 3 files changed, 16 insertions(+), 9 deletions(-)
(limited to 'net')

diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 9bcec9b927b9..505e2d4b3d62 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -66,10 +66,10 @@ rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt)
 	u32 flavor = pseudoflavor_to_flavor(pseudoflavor);

 	if (flavor >= RPC_AUTH_MAXFLAVOR || !(ops = auth_flavors[flavor]))
-		return NULL;
+		return ERR_PTR(-EINVAL);
 	auth = ops->create(clnt, pseudoflavor);
-	if (!auth)
-		return NULL;
+	if (IS_ERR(auth))
+		return auth;
 	if (clnt->cl_auth)
 		rpcauth_destroy(clnt->cl_auth);
 	clnt->cl_auth = auth;
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 7d88db83ab12..2f7b867161d2 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -660,14 +660,16 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
 {
 	struct gss_auth *gss_auth;
 	struct rpc_auth * auth;
+	int err = -ENOMEM; /* XXX? */

 	dprintk("RPC: creating GSS authenticator for client %p\n",clnt);

 	if (!try_module_get(THIS_MODULE))
-		return NULL;
+		return ERR_PTR(err);
 	if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL)))
 		goto out_dec;
 	gss_auth->client = clnt;
+	err = -EINVAL;
 	gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor);
 	if (!gss_auth->mech) {
 		printk(KERN_WARNING "%s: Pseudoflavor %d not found!",
@@ -686,15 +688,18 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
 	auth->au_flavor = flavor;
 	atomic_set(&auth->au_count, 1);

-	if (rpcauth_init_credcache(auth, GSS_CRED_EXPIRE) < 0)
+	err = rpcauth_init_credcache(auth, GSS_CRED_EXPIRE);
+	if (err)
 		goto err_put_mech;

 	snprintf(gss_auth->path, sizeof(gss_auth->path), "%s/%s",
 			clnt->cl_pathname,
 			gss_auth->mech->gm_name);
 	gss_auth->dentry = rpc_mkpipe(gss_auth->path, clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN);
-	if (IS_ERR(gss_auth->dentry))
+	if (IS_ERR(gss_auth->dentry)) {
+		err = PTR_ERR(gss_auth->dentry);
 		goto err_put_mech;
+	}

 	return auth;
err_put_mech:
@@ -703,7 +708,7 @@ err_free:
 	kfree(gss_auth);
out_dec:
 	module_put(THIS_MODULE);
-	return NULL;
+	return ERR_PTR(err);
 }

 static void
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index b36797ad8083..9da1deb482e2 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -103,6 +103,7 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname,
 {
 	struct rpc_version	*version;
 	struct rpc_clnt		*clnt = NULL;
+	struct rpc_auth		*auth;
 	int err;
 	int len;

@@ -157,10 +158,11 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname,
 	if (err < 0)
 		goto out_no_path;

-	err = -ENOMEM;
-	if (!rpcauth_create(flavor, clnt)) {
+	auth = rpcauth_create(flavor, clnt);
+	if (IS_ERR(auth)) {
 		printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n",
 				flavor);
+		err = PTR_ERR(auth);
 		goto out_no_auth;
 	}

-- 
cgit
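The ERR_PTR()/IS_ERR()/PTR_ERR() helpers from <linux/err.h> encode a small negative errno inside an invalid pointer value, which is what lets ->create() report why it failed rather than just returning NULL. A generic sketch (the widget names are illustrative):

	struct widget *widget_create(void)
	{
		struct widget *w = kmalloc(sizeof(*w), GFP_KERNEL);

		if (!w)
			return ERR_PTR(-ENOMEM);	/* errno hidden in the pointer */
		return w;
	}

	/* caller */
	struct widget *w = widget_create();
	if (IS_ERR(w))
		return PTR_ERR(w);			/* recover the -ENOMEM */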
From cdf477068e6db0c3e19df96f46abb85202de138c Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher
Date: Wed, 22 Jun 2005 17:16:23 +0000
Subject: [PATCH] RPC: Return -EPFNOSUPPORT for RPC programs that are unavailable

Signed-off-by: Andreas Gruenbacher
Signed-off-by: Olaf Kirch
Signed-off-by: Trond Myklebust
---
 net/sunrpc/clnt.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)
(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 9da1deb482e2..33f12b84e265 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1021,10 +1021,11 @@ call_verify(struct rpc_task *task)
 			case RPC_AUTH_ERROR:
 				break;
 			case RPC_MISMATCH:
-				printk(KERN_WARNING "%s: RPC call version mismatch!\n", __FUNCTION__);
-				goto out_eio;
+				dprintk("%s: RPC call version mismatch!\n", __FUNCTION__);
+				error = -EPROTONOSUPPORT;
+				goto out_err;
 			default:
-				printk(KERN_WARNING "%s: RPC call rejected, unknown error: %x\n", __FUNCTION__, n);
+				dprintk("%s: RPC call rejected, unknown error: %x\n", __FUNCTION__, n);
 				goto out_eio;
 		}
 		if (--len < 0)
@@ -1075,23 +1076,26 @@ call_verify(struct rpc_task *task)
 	case RPC_SUCCESS:
 		return p;
 	case RPC_PROG_UNAVAIL:
-		printk(KERN_WARNING "RPC: call_verify: program %u is unsupported by server %s\n",
+		dprintk("RPC: call_verify: program %u is unsupported by server %s\n",
 				(unsigned int)task->tk_client->cl_prog,
 				task->tk_client->cl_server);
-		goto out_eio;
+		error = -EPFNOSUPPORT;
+		goto out_err;
 	case RPC_PROG_MISMATCH:
-		printk(KERN_WARNING "RPC: call_verify: program %u, version %u unsupported by server %s\n",
+		dprintk("RPC: call_verify: program %u, version %u unsupported by server %s\n",
 				(unsigned int)task->tk_client->cl_prog,
 				(unsigned int)task->tk_client->cl_vers,
 				task->tk_client->cl_server);
-		goto out_eio;
+		error = -EPROTONOSUPPORT;
+		goto out_err;
 	case RPC_PROC_UNAVAIL:
-		printk(KERN_WARNING "RPC: call_verify: proc %p unsupported by program %u, version %u on server %s\n",
+		dprintk("RPC: call_verify: proc %p unsupported by program %u, version %u on server %s\n",
 				task->tk_msg.rpc_proc,
 				task->tk_client->cl_prog,
 				task->tk_client->cl_vers,
 				task->tk_client->cl_server);
-		goto out_eio;
+		error = -EOPNOTSUPP;
+		goto out_err;
 	case RPC_GARBAGE_ARGS:
 		dprintk("RPC: %4d %s: server saw garbage\n", task->tk_pid, __FUNCTION__);
 		break;			/* retry */
@@ -1104,7 +1108,7 @@ out_retry:
 	task->tk_client->cl_stats->rpcgarbage++;
 	if (task->tk_garb_retry) {
 		task->tk_garb_retry--;
-		dprintk(KERN_WARNING "RPC %s: retrying %4d\n", __FUNCTION__, task->tk_pid);
+		dprintk("RPC %s: retrying %4d\n", __FUNCTION__, task->tk_pid);
 		task->tk_action = call_bind;
 		return NULL;
 	}
-- 
cgit
From 007e251f2b2760f738c92adc8c80cbae0bed3ce5 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher
Date: Wed, 22 Jun 2005 17:16:23 +0000
Subject: [PATCH] RPC: Allow multiple RPC client programs to share the same transport

Signed-off-by: Andreas Gruenbacher
Acked-by: Olaf Kirch
Signed-off-by: Trond Myklebust
---
 net/sunrpc/clnt.c        | 40 ++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/pmap_clnt.c   |  3 +++
 net/sunrpc/sunrpc_syms.c |  1 +
 3 files changed, 44 insertions(+)
(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 33f12b84e265..c979fcf88798 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -241,6 +241,8 @@ rpc_clone_client(struct rpc_clnt *clnt)
 	rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval);
 	if (new->cl_auth)
 		atomic_inc(&new->cl_auth->au_count);
+	new->cl_pmap = &new->cl_pmap_default;
+	rpc_init_wait_queue(&new->cl_pmap_default.pm_bindwait, "bindwait");
 	return new;
out_no_clnt:
 	printk(KERN_INFO "RPC: out of memory in %s\n", __FUNCTION__);
@@ -329,6 +331,44 @@ rpc_release_client(struct rpc_clnt *clnt)
 		rpc_destroy_client(clnt);
 }

+/**
+ * rpc_bind_new_program - bind a new RPC program to an existing client
+ * @old - old rpc_client
+ * @program - rpc program to set
+ * @vers - rpc program version
+ *
+ * Clones the rpc client and sets up a new RPC program. This is mainly
+ * of use for enabling different RPC programs to share the same transport.
+ * The Sun NFSv2/v3 ACL protocol can do this.
+ */
+struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old,
+				      struct rpc_program *program,
+				      int vers)
+{
+	struct rpc_clnt *clnt;
+	struct rpc_version *version;
+	int err;
+
+	BUG_ON(vers >= program->nrvers || !program->version[vers]);
+	version = program->version[vers];
+	clnt = rpc_clone_client(old);
+	if (IS_ERR(clnt))
+		goto out;
+	clnt->cl_procinfo = version->procs;
+	clnt->cl_maxproc = version->nrprocs;
+	clnt->cl_protname = program->name;
+	clnt->cl_prog = program->number;
+	clnt->cl_vers = version->number;
+	clnt->cl_stats = program->stats;
+	err = rpc_ping(clnt, RPC_TASK_SOFT|RPC_TASK_NOINTR);
+	if (err != 0) {
+		rpc_shutdown_client(clnt);
+		clnt = ERR_PTR(err);
+	}
+out:
+	return clnt;
+}
+
 /*
  * Default callback for async RPC calls
  */
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index df4d84c9020d..4e81f2766923 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -53,6 +53,9 @@ rpc_getport(struct rpc_task *task, struct rpc_clnt *clnt)
 			task->tk_pid, clnt->cl_server,
 			map->pm_prog, map->pm_vers, map->pm_prot);

+	/* Autobind on cloned rpc clients is discouraged */
+	BUG_ON(clnt->cl_parent != clnt);
+
 	spin_lock(&pmap_lock);
 	if (map->pm_binding) {
 		rpc_sleep_on(&map->pm_bindwait, task, NULL, NULL);
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 1b0ff7e0e869..d8673f66acc3 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -42,6 +42,7 @@ EXPORT_SYMBOL(rpc_release_task);
 /* RPC client functions */
 EXPORT_SYMBOL(rpc_create_client);
 EXPORT_SYMBOL(rpc_clone_client);
+EXPORT_SYMBOL(rpc_bind_new_program);
 EXPORT_SYMBOL(rpc_destroy_client);
 EXPORT_SYMBOL(rpc_shutdown_client);
 EXPORT_SYMBOL(rpc_release_client);
-- 
cgit
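A hypothetical caller, sketched against the function added above: an NFS client that already holds an rpc_clnt for the NFS program could reuse the same transport for the ACL side protocol (identifier names here are illustrative):

	struct rpc_clnt *acl_clnt;

	acl_clnt = rpc_bind_new_program(nfs_clnt, &nfsacl_program, 3);
	if (IS_ERR(acl_clnt))
		return PTR_ERR(acl_clnt);	/* server did not answer the NFSACL ping */

Note the BUG_ON() added to rpc_getport(): autobinding on clones produced this way is deliberately disallowed, since the cloned clients share the parent's portmap state.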
From e053d1ab62c8ef0eff3dd4c95448cad3c6d2fbf4 Mon Sep 17 00:00:00 2001
From: Olaf Kirch
Date: Wed, 22 Jun 2005 17:16:24 +0000
Subject: [PATCH] RPC: Lazy RPC receive buffer allocation

Signed-off-by: Olaf Kirch
Signed-off-by: Andreas Gruenbacher
Signed-off-by: Trond Myklebust
---
 net/sunrpc/xdr.c  | 16 +++++++++++++---
 net/sunrpc/xprt.c | 26 ++++++++++++++++++++++----
 2 files changed, 35 insertions(+), 7 deletions(-)
(limited to 'net')

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index f86d1baa6302..65b268d39782 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -176,7 +176,7 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
 	xdr->buflen += len;
 }

-void
+int
 xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 			  skb_reader_t *desc,
 			  skb_read_actor_t copy_actor)
@@ -190,7 +190,7 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 		len -= base;
 		ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len);
 		if (ret != len || !desc->count)
-			return;
+			return 0;
 		base = 0;
 	} else
 		base -= len;
@@ -210,6 +210,14 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 	do {
 		char *kaddr;

+		/* ACL likes to be lazy in allocating pages - ACLs
+		 * are small by default but can get huge. */
+		if (unlikely(*ppage == NULL)) {
+			*ppage = alloc_page(GFP_ATOMIC);
+			if (unlikely(*ppage == NULL))
+				return -ENOMEM;
+		}
+
 		len = PAGE_CACHE_SIZE;
 		kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA);
 		if (base) {
@@ -226,13 +234,15 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 		flush_dcache_page(*ppage);
 		kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA);
 		if (ret != len || !desc->count)
-			return;
+			return 0;
 		ppage++;
 	} while ((pglen -= len) != 0);
copy_tail:
 	len = xdr->tail[0].iov_len;
 	if (base < len)
 		copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
+
+	return 0;
 }

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index c74a6bb94074..a180ed4952d6 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -725,7 +725,8 @@ csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
 		goto no_checksum;

 	desc.csum = csum_partial(skb->data, desc.offset, skb->csum);
-	xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits);
+	if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits) < 0)
+		return -1;
 	if (desc.offset != skb->len) {
 		unsigned int csum2;
 		csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0);
@@ -737,7 +738,8 @@ csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
 		return -1;
 	return 0;
no_checksum:
-	xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits);
+	if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0)
+		return -1;
 	if (desc.count)
 		return -1;
 	return 0;
@@ -907,6 +909,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 	struct rpc_rqst *req;
 	struct xdr_buf *rcvbuf;
 	size_t len;
+	int r;

 	/* Find and lock the request corresponding to this xid */
 	spin_lock(&xprt->sock_lock);
@@ -927,16 +930,30 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 		len = xprt->tcp_reclen - xprt->tcp_offset;
 		memcpy(&my_desc, desc, sizeof(my_desc));
 		my_desc.count = len;
-		xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
+		r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
 					  &my_desc, tcp_copy_data);
 		desc->count -= len;
 		desc->offset += len;
 	} else
-		xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
+		r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
 					  desc, tcp_copy_data);
 	xprt->tcp_copied += len;
 	xprt->tcp_offset += len;

+	if (r < 0) {
+		/* Error when copying to the receive buffer,
+		 * usually because we weren't able to allocate
+		 * additional buffer pages. All we can do now
+		 * is turn off XPRT_COPY_DATA, so the request
+		 * will not receive any additional updates,
+		 * and time out.
+		 * Any remaining data from this record will
+		 * be discarded.
+		 */
+		xprt->tcp_flags &= ~XPRT_COPY_DATA;
+		goto out;
+	}
+
 	if (xprt->tcp_copied == req->rq_private_buf.buflen)
 		xprt->tcp_flags &= ~XPRT_COPY_DATA;
 	else if (xprt->tcp_offset == xprt->tcp_reclen) {
@@ -949,6 +966,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 				req->rq_task->tk_pid);
 		xprt_complete_rqst(xprt, req, xprt->tcp_copied);
 	}
+out:
 	spin_unlock(&xprt->sock_lock);
 	tcp_check_recm(xprt);
 }
-- 
cgit
From 7e06b53d796a3740307b54aa2799077f8a0c84e7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Wed, 22 Jun 2005 17:16:24 +0000
Subject: [PATCH] RPC: fix accounting bug in the case of a truncated RPC message

Signed-off-by: Trond Myklebust
---
 net/sunrpc/xdr.c  | 22 ++++++++++++++--------
 net/sunrpc/xprt.c | 35 +++++++++++++++++++++++++++--------
 2 files changed, 41 insertions(+), 16 deletions(-)
(limited to 'net')

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 65b268d39782..b3ac3f72bf9c 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -176,21 +176,23 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
 	xdr->buflen += len;
 }

-int
+ssize_t
 xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 			  skb_reader_t *desc,
 			  skb_read_actor_t copy_actor)
 {
 	struct page	**ppage = xdr->pages;
 	unsigned int	len, pglen = xdr->page_len;
+	ssize_t		copied = 0;
 	int		ret;

 	len = xdr->head[0].iov_len;
 	if (base < len) {
 		len -= base;
 		ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len);
+		copied += ret;
 		if (ret != len || !desc->count)
-			return 0;
+			goto out;
 		base = 0;
 	} else
 		base -= len;
@@ -214,8 +216,11 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 		 * are small by default but can get huge. */
 		if (unlikely(*ppage == NULL)) {
 			*ppage = alloc_page(GFP_ATOMIC);
-			if (unlikely(*ppage == NULL))
-				return -ENOMEM;
+			if (unlikely(*ppage == NULL)) {
+				if (copied == 0)
+					copied = -ENOMEM;
+				goto out;
+			}
 		}

 		len = PAGE_CACHE_SIZE;
@@ -233,16 +238,17 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 		}
 		flush_dcache_page(*ppage);
 		kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA);
+		copied += ret;
 		if (ret != len || !desc->count)
-			return 0;
+			goto out;
 		ppage++;
 	} while ((pglen -= len) != 0);
copy_tail:
 	len = xdr->tail[0].iov_len;
 	if (base < len)
-		copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
-
-	return 0;
+		copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
+out:
+	return copied;
 }

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index a180ed4952d6..ef941e7de8bf 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -823,10 +823,15 @@ tcp_copy_data(skb_reader_t *desc, void *p, size_t len)
 {
 	if (len > desc->count)
 		len = desc->count;
-	if (skb_copy_bits(desc->skb, desc->offset, p, len))
+	if (skb_copy_bits(desc->skb, desc->offset, p, len)) {
+		dprintk("RPC: failed to copy %zu bytes from skb. %zu bytes remain\n",
+				len, desc->count);
 		return 0;
+	}
 	desc->offset += len;
 	desc->count -= len;
+	dprintk("RPC: copied %zu bytes from skb. %zu bytes remain\n",
			len, desc->count);
 	return len;
 }

@@ -865,6 +870,8 @@ tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc)
 static void
 tcp_check_recm(struct rpc_xprt *xprt)
 {
+	dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n",
+			xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags);
 	if (xprt->tcp_offset == xprt->tcp_reclen) {
 		xprt->tcp_flags |= XPRT_COPY_RECM;
 		xprt->tcp_offset = 0;
@@ -909,7 +916,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 	struct rpc_rqst *req;
 	struct xdr_buf *rcvbuf;
 	size_t len;
-	int r;
+	ssize_t r;

 	/* Find and lock the request corresponding to this xid */
 	spin_lock(&xprt->sock_lock);
@@ -932,15 +939,17 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 		my_desc.count = len;
 		r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
 					  &my_desc, tcp_copy_data);
-		desc->count -= len;
-		desc->offset += len;
+		desc->count -= r;
+		desc->offset += r;
 	} else
 		r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
 					  desc, tcp_copy_data);
-	xprt->tcp_copied += len;
-	xprt->tcp_offset += len;

-	if (r < 0) {
+	if (r > 0) {
+		xprt->tcp_copied += r;
+		xprt->tcp_offset += r;
+	}
+	if (r != len) {
 		/* Error when copying to the receive buffer,
 		 * usually because we weren't able to allocate
 		 * additional buffer pages. All we can do now
@@ -951,9 +960,18 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 		 * be discarded.
 		 */
 		xprt->tcp_flags &= ~XPRT_COPY_DATA;
+		dprintk("RPC: XID %08x truncated request\n",
+				ntohl(xprt->tcp_xid));
+		dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
+				xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
 		goto out;
 	}

+	dprintk("RPC: XID %08x read %u bytes\n",
+			ntohl(xprt->tcp_xid), r);
+	dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
+			xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
+
 	if (xprt->tcp_copied == req->rq_private_buf.buflen)
 		xprt->tcp_flags &= ~XPRT_COPY_DATA;
 	else if (xprt->tcp_offset == xprt->tcp_reclen) {
@@ -961,12 +979,12 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 		xprt->tcp_flags &= ~XPRT_COPY_DATA;
 	}

+out:
 	if (!(xprt->tcp_flags & XPRT_COPY_DATA)) {
 		dprintk("RPC: %4d received reply complete\n",
 				req->rq_task->tk_pid);
 		xprt_complete_rqst(xprt, req, xprt->tcp_copied);
 	}
-out:
 	spin_unlock(&xprt->sock_lock);
 	tcp_check_recm(xprt);
 }
@@ -985,6 +1003,7 @@ tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc)
 	desc->count -= len;
 	desc->offset += len;
 	xprt->tcp_offset += len;
+	dprintk("RPC: discarded %u bytes\n", len);
 	tcp_check_recm(xprt);
 }
-- 
cgit
From bd8100e7eda87507649c6ba4cb32173b34e49986 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher
Date: Wed, 22 Jun 2005 17:16:24 +0000
Subject: [PATCH] RPC: Encode and decode arbitrary XDR arrays

Signed-off-by: Andreas Gruenbacher
Acked-by: Olaf Kirch
Signed-off-by: Trond Myklebust
---
 net/sunrpc/sunrpc_syms.c |   4 +
 net/sunrpc/xdr.c         | 256 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 257 insertions(+), 3 deletions(-)
(limited to 'net')

diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index d8673f66acc3..32e8acbc60fe 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -129,6 +129,10 @@ EXPORT_SYMBOL(xdr_encode_netobj);
 EXPORT_SYMBOL(xdr_encode_pages);
 EXPORT_SYMBOL(xdr_inline_pages);
 EXPORT_SYMBOL(xdr_shift_buf);
+EXPORT_SYMBOL(xdr_encode_word);
+EXPORT_SYMBOL(xdr_decode_word);
+EXPORT_SYMBOL(xdr_encode_array2);
+EXPORT_SYMBOL(xdr_decode_array2);
 EXPORT_SYMBOL(xdr_buf_from_iov);
 EXPORT_SYMBOL(xdr_buf_subsegment);
 EXPORT_SYMBOL(xdr_buf_read_netobj);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index b3ac3f72bf9c..8a4d9c106af1 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -887,8 +887,34 @@ out:
 	return status;
 }

-static int
-read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj)
+/* obj is assumed to point to allocated memory of size at least len: */
+int
+write_bytes_to_xdr_buf(struct xdr_buf *buf, int base, void *obj, int len)
+{
+	struct xdr_buf subbuf;
+	int this_len;
+	int status;
+
+	status = xdr_buf_subsegment(buf, &subbuf, base, len);
+	if (status)
+		goto out;
+	this_len = min(len, (int)subbuf.head[0].iov_len);
+	memcpy(subbuf.head[0].iov_base, obj, this_len);
+	len -= this_len;
+	obj += this_len;
+	this_len = min(len, (int)subbuf.page_len);
+	if (this_len)
+		_copy_to_pages(subbuf.pages, subbuf.page_base, obj, this_len);
+	len -= this_len;
+	obj += this_len;
+	this_len = min(len, (int)subbuf.tail[0].iov_len);
+	memcpy(subbuf.tail[0].iov_base, obj, this_len);
+out:
+	return status;
+}
+
+int
+xdr_decode_word(struct xdr_buf *buf, int base, u32 *obj)
 {
 	u32	raw;
 	int	status;
@@ -900,6 +926,14 @@ read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj)
 	return 0;
 }

+int
+xdr_encode_word(struct xdr_buf *buf, int base, u32 obj)
+{
+	u32	raw = htonl(obj);
+
+	return write_bytes_to_xdr_buf(buf, base, &raw, sizeof(obj));
+}
+
 /* If the netobj starting offset bytes from the start of xdr_buf is contained
  * entirely in the head or the tail, set object to point to it; otherwise
  * try to find space for it at the end of the tail, copy it there, and
@@ -910,7 +944,7 @@ xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, int offset)
 	u32	tail_offset = buf->head[0].iov_len + buf->page_len;
 	u32	obj_end_offset;

-	if (read_u32_from_xdr_buf(buf, offset, &obj->len))
+	if (xdr_decode_word(buf, offset, &obj->len))
 		goto out;
 	obj_end_offset = offset + 4 + obj->len;
@@ -943,3 +977,219 @@ xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, int offset)
out:
 	return -1;
 }
+
+/* Returns 0 on success, or else a negative error code. */
+static int
+xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
+		 struct xdr_array2_desc *desc, int encode)
+{
+	char *elem = NULL, *c;
+	unsigned int copied = 0, todo, avail_here;
+	struct page **ppages = NULL;
+	int err;
+
+	if (encode) {
+		if (xdr_encode_word(buf, base, desc->array_len) != 0)
+			return -EINVAL;
+	} else {
+		if (xdr_decode_word(buf, base, &desc->array_len) != 0 ||
+		    (unsigned long) base + 4 + desc->array_len *
+				    desc->elem_size > buf->len)
+			return -EINVAL;
+	}
+	base += 4;
+
+	if (!desc->xcode)
+		return 0;
+
+	todo = desc->array_len * desc->elem_size;
+
+	/* process head */
+	if (todo && base < buf->head->iov_len) {
+		c = buf->head->iov_base + base;
+		avail_here = min_t(unsigned int, todo,
+				   buf->head->iov_len - base);
+		todo -= avail_here;
+
+		while (avail_here >= desc->elem_size) {
+			err = desc->xcode(desc, c);
+			if (err)
+				goto out;
+			c += desc->elem_size;
+			avail_here -= desc->elem_size;
+		}
+		if (avail_here) {
+			if (!elem) {
+				elem = kmalloc(desc->elem_size, GFP_KERNEL);
+				err = -ENOMEM;
+				if (!elem)
+					goto out;
+			}
+			if (encode) {
+				err = desc->xcode(desc, elem);
+				if (err)
+					goto out;
+				memcpy(c, elem, avail_here);
+			} else
+				memcpy(elem, c, avail_here);
+			copied = avail_here;
+		}
+		base = buf->head->iov_len;  /* align to start of pages */
+	}
+
+	/* process pages array */
+	base -= buf->head->iov_len;
+	if (todo && base < buf->page_len) {
+		unsigned int avail_page;
+
+		avail_here = min(todo, buf->page_len - base);
+		todo -= avail_here;
+
+		base += buf->page_base;
+		ppages = buf->pages + (base >> PAGE_CACHE_SHIFT);
+		base &= ~PAGE_CACHE_MASK;
+		avail_page = min_t(unsigned int, PAGE_CACHE_SIZE - base,
+					avail_here);
+		c = kmap(*ppages) + base;
+
+		while (avail_here) {
+			avail_here -= avail_page;
+			if (copied || avail_page < desc->elem_size) {
+				unsigned int l = min(avail_page,
+					desc->elem_size - copied);
+				if (!elem) {
+					elem = kmalloc(desc->elem_size,
+						       GFP_KERNEL);
+					err = -ENOMEM;
+					if (!elem)
+						goto out;
+				}
+				if (encode) {
+					if (!copied) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+					}
+					memcpy(c, elem + copied, l);
+					copied += l;
+					if (copied == desc->elem_size)
+						copied = 0;
+				} else {
+					memcpy(elem + copied, c, l);
+					copied += l;
+					if (copied == desc->elem_size) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+						copied = 0;
+					}
+				}
+				avail_page -= l;
+				c += l;
+			}
+			while (avail_page >= desc->elem_size) {
+				err = desc->xcode(desc, c);
+				if (err)
+					goto out;
+				c += desc->elem_size;
+				avail_page -= desc->elem_size;
+			}
+			if (avail_page) {
+				unsigned int l = min(avail_page,
+					    desc->elem_size - copied);
+				if (!elem) {
+					elem = kmalloc(desc->elem_size,
+						       GFP_KERNEL);
+					err = -ENOMEM;
+					if (!elem)
+						goto out;
+				}
+				if (encode) {
+					if (!copied) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+					}
+					memcpy(c, elem + copied, l);
+					copied += l;
+					if (copied == desc->elem_size)
+						copied = 0;
+				} else {
+					memcpy(elem + copied, c, l);
+					copied += l;
+					if (copied == desc->elem_size) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+						copied = 0;
+					}
+				}
+			}
+			if (avail_here) {
+				kunmap(*ppages);
+				ppages++;
+				c = kmap(*ppages);
+			}
+
+			avail_page = min(avail_here,
+				 (unsigned int) PAGE_CACHE_SIZE);
+		}
+		base = buf->page_len;  /* align to start of tail */
+	}
+
+	/* process tail */
+	base -= buf->page_len;
+	if (todo) {
+		c = buf->tail->iov_base + base;
+		if (copied) {
+			unsigned int l = desc->elem_size - copied;
+
+			if (encode)
+				memcpy(c, elem + copied, l);
+			else {
+				memcpy(elem + copied, c, l);
+				err = desc->xcode(desc, elem);
+				if (err)
+					goto out;
+			}
+			todo -= l;
+			c += l;
+		}
+		while (todo) {
+			err = desc->xcode(desc, c);
+			if (err)
+				goto out;
+			c += desc->elem_size;
+			todo -= desc->elem_size;
+		}
+	}
+	err = 0;
+
+out:
+	if (elem)
+		kfree(elem);
+	if (ppages)
+		kunmap(*ppages);
+	return err;
+}
+
+int
+xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
+		  struct xdr_array2_desc *desc)
+{
+	if (base >= buf->len)
+		return -EINVAL;
+
+	return xdr_xcode_array2(buf, base, desc, 0);
+}
+
+int
+xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
+		  struct xdr_array2_desc *desc)
+{
+	if ((unsigned long) base + 4 + desc->array_len * desc->elem_size >
+	    buf->head->iov_len + buf->page_len + buf->tail->iov_len)
+		return -EINVAL;
+
+	return xdr_xcode_array2(buf, base, desc, 1);
+}
-- 
cgit
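A hypothetical user of the new interface, written against only the fields the code above actually touches (elem_size, array_len, and the xcode callback). The callback is invoked once per element with the element's bytes linearized, even when they straddled a page boundary inside the xdr_buf:

	struct triple { u32 a, b, c; };		/* fixed-size wire element (illustrative) */

	static int triple_xcode(struct xdr_array2_desc *desc, void *elem)
	{
		struct triple *t = elem;

		return ntohl(t->a) <= ntohl(t->b) ? 0 : -EINVAL;
	}

	struct xdr_array2_desc desc = {
		.elem_size = sizeof(struct triple),
		.xcode     = triple_xcode,
	};
	err = xdr_decode_array2(buf, base, &desc);	/* decode fills desc.array_len */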
From 9ba02638e4be28dd4ff724202a640264427c62d1 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher
Date: Wed, 22 Jun 2005 17:16:24 +0000
Subject: [PATCH] RPC: Allow the sunrpc server to multiplex several programs on a single port

The NFS and NFSACL programs run on the same RPC transport. This patch adds
support for this by converting svc_program into a chained list of programs
(server-side).

Signed-off-by: Andreas Gruenbacher
Signed-off-by: Olaf Kirch
Signed-off-by: Andrew Morton
Signed-off-by: Trond Myklebust
---
 net/sunrpc/svc.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)
(limited to 'net')

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index a02d424a7409..e9bd91265f70 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -35,20 +35,24 @@ svc_create(struct svc_program *prog, unsigned int bufsize)
 	if (!(serv = (struct svc_serv *) kmalloc(sizeof(*serv), GFP_KERNEL)))
 		return NULL;
 	memset(serv, 0, sizeof(*serv));
+	serv->sv_name      = prog->pg_name;
 	serv->sv_program   = prog;
 	serv->sv_nrthreads = 1;
 	serv->sv_stats     = prog->pg_stats;
 	serv->sv_bufsz     = bufsize? bufsize : 4096;
-	prog->pg_lovers = prog->pg_nvers-1;
 	xdrsize = 0;
-	for (vers=0; vers<prog->pg_nvers ; vers++)
-		if (prog->pg_vers[vers]) {
-			prog->pg_hivers = vers;
-			if (prog->pg_lovers > vers)
-				prog->pg_lovers = vers;
-			if (prog->pg_vers[vers]->vs_xdrsize > xdrsize)
-				xdrsize = prog->pg_vers[vers]->vs_xdrsize;
-		}
+	while (prog) {
+		prog->pg_lovers = prog->pg_nvers-1;
+		for (vers=0; vers<prog->pg_nvers ; vers++)
+			if (prog->pg_vers[vers]) {
+				prog->pg_hivers = vers;
+				if (prog->pg_lovers > vers)
+					prog->pg_lovers = vers;
+				if (prog->pg_vers[vers]->vs_xdrsize > xdrsize)
+					xdrsize = prog->pg_vers[vers]->vs_xdrsize;
+			}
+		prog = prog->pg_next;
+	}
 	serv->sv_xdrsize   = xdrsize;
 	INIT_LIST_HEAD(&serv->sv_threads);
 	INIT_LIST_HEAD(&serv->sv_sockets);
@@ -56,8 +60,6 @@ svc_create(struct svc_program *prog, unsigned int bufsize)
 	INIT_LIST_HEAD(&serv->sv_permsocks);
 	spin_lock_init(&serv->sv_lock);

-	serv->sv_name      = prog->pg_name;
-
 	/* Remove any stale portmap registrations */
 	svc_register(serv, 0, 0);

@@ -339,7 +341,10 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp)
 		goto sendit;
 	}

-	if (prog != progp->pg_prog)
+	for (progp = serv->sv_program; progp; progp = progp->pg_next)
+		if (prog == progp->pg_prog)
+			break;
+	if (progp == NULL)
 		goto err_bad_prog;

 	if (vers >= progp->pg_nvers ||
@@ -452,11 +457,7 @@ err_bad_auth:
 	goto sendit;

err_bad_prog:
-#ifdef RPC_PARANOIA
-	if (prog != 100227 || progp->pg_prog != 100003)
-		printk("svc: unknown program %d (me %d)\n", prog, progp->pg_prog);
-	/* else it is just a Solaris client seeing if ACLs are supported */
-#endif
+	dprintk("svc: unknown program %d\n", prog);
 	serv->sv_stats->rpcbadfmt++;
 	svc_putu32(resv, rpc_prog_unavail);
 	goto sendit;
-- 
cgit
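Server side, the chaining this patch enables looks roughly like this (a sketch; the program numbers come from the RPC_PARANOIA block deleted above, and the field layout is illustrative):

	static struct svc_program nfsacl_program = {
		.pg_prog = 100227,		/* NFS_ACL */
		.pg_name = "nfsacl",
		/* ... pg_nvers, pg_vers, pg_stats ... */
	};

	static struct svc_program nfsd_program = {
		.pg_prog = 100003,		/* NFS */
		.pg_name = "nfsd",
		.pg_next = &nfsacl_program,	/* second program, same port */
		/* ... */
	};

	serv = svc_create(&nfsd_program, bufsize);

svc_process() then walks the pg_next chain to match the program number of each incoming call.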
From 14b218a8e4f110206c46e586a3da372f665631e7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Wed, 22 Jun 2005 17:16:28 +0000
Subject: [PATCH] RPC: Ensure rpc calls respect the RPC_NOINTR flag

For internal purposes, the rpc_clnt_sigmask() call is replaced by a call
to rpc_task_sigmask(), which ensures that the current task sigmask respects
both the client cl_intr flag and the per-task NOINTR flag.

Problem noticed by Jiaying Zhang.
Signed-off-by: Trond Myklebust
---
 net/sunrpc/clnt.c | 71 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 37 insertions(+), 34 deletions(-)
(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index c979fcf88798..f17e6153b688 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -378,38 +378,41 @@ rpc_default_callback(struct rpc_task *task)
 }

 /*
- * Export the signal mask handling for aysnchronous code that
+ * Export the signal mask handling for synchronous code that
  * sleeps on RPC calls
  */
+#define RPC_INTR_SIGNALS (sigmask(SIGINT) | sigmask(SIGQUIT) | sigmask(SIGKILL))

+static void rpc_save_sigmask(sigset_t *oldset, int intr)
+{
+	unsigned long	sigallow = 0;
+	sigset_t sigmask;
+
+	/* Block all signals except those listed in sigallow */
+	if (intr)
+		sigallow |= RPC_INTR_SIGNALS;
+	siginitsetinv(&sigmask, sigallow);
+	sigprocmask(SIG_BLOCK, &sigmask, oldset);
+}
+
+static inline void rpc_task_sigmask(struct rpc_task *task, sigset_t *oldset)
+{
+	rpc_save_sigmask(oldset, !RPC_TASK_UNINTERRUPTIBLE(task));
+}
+
+static inline void rpc_restore_sigmask(sigset_t *oldset)
+{
+	sigprocmask(SIG_SETMASK, oldset, NULL);
+}
+
 void rpc_clnt_sigmask(struct rpc_clnt *clnt, sigset_t *oldset)
 {
-	unsigned long	sigallow = sigmask(SIGKILL);
-	unsigned long	irqflags;
-
-	/* Turn off various signals */
-	if (clnt->cl_intr) {
-		struct k_sigaction *action = current->sighand->action;
-		if (action[SIGINT-1].sa.sa_handler == SIG_DFL)
-			sigallow |= sigmask(SIGINT);
-		if (action[SIGQUIT-1].sa.sa_handler == SIG_DFL)
-			sigallow |= sigmask(SIGQUIT);
-	}
-	spin_lock_irqsave(&current->sighand->siglock, irqflags);
-	*oldset = current->blocked;
-	siginitsetinv(&current->blocked, sigallow & ~oldset->sig[0]);
-	recalc_sigpending();
-	spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
+	rpc_save_sigmask(oldset, clnt->cl_intr);
 }

 void rpc_clnt_sigunmask(struct rpc_clnt *clnt, sigset_t *oldset)
 {
-	unsigned long	irqflags;
-
-	spin_lock_irqsave(&current->sighand->siglock, irqflags);
-	current->blocked = *oldset;
-	recalc_sigpending();
-	spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
+	rpc_restore_sigmask(oldset);
 }

 /*
@@ -427,26 +430,26 @@ int rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)

 	BUG_ON(flags & RPC_TASK_ASYNC);

-	rpc_clnt_sigmask(clnt, &oldset);
-
 	status = -ENOMEM;
 	task = rpc_new_task(clnt, NULL, flags);
 	if (task == NULL)
 		goto out;

+	/* Mask signals on RPC calls _and_ GSS_AUTH upcalls */
+	rpc_task_sigmask(task, &oldset);
+
 	rpc_call_setup(task, msg, 0);

 	/* Set up the call info struct and execute the task */
-	if (task->tk_status == 0)
+	if (task->tk_status == 0) {
 		status = rpc_execute(task);
-	else {
+	} else {
 		status = task->tk_status;
 		rpc_release_task(task);
 	}

+	rpc_restore_sigmask(&oldset);
out:
-	rpc_clnt_sigunmask(clnt, &oldset);
-
 	return status;
 }

@@ -467,8 +470,6 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,

 	flags |= RPC_TASK_ASYNC;

-	rpc_clnt_sigmask(clnt, &oldset);
-
 	/* Create/initialize a new RPC task */
 	if (!callback)
 		callback = rpc_default_callback;
@@ -477,6 +478,9 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
 		goto out;
 	task->tk_calldata = data;

+	/* Mask signals on GSS_AUTH upcalls */
+	rpc_task_sigmask(task, &oldset);
+
 	rpc_call_setup(task, msg, 0);

 	/* Set up the call info struct and execute the task */
@@ -486,9 +490,8 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
 	else
 		rpc_release_task(task);

+	rpc_restore_sigmask(&oldset);
out:
-	rpc_clnt_sigunmask(clnt, &oldset);
-
 	return status;
 }

@@ -666,7 +669,7 @@ call_allocate(struct rpc_task *task)
 		return;
 	printk(KERN_INFO "RPC: buffer allocation failed for task %p\n", task);

-	if (RPC_IS_ASYNC(task) || !(task->tk_client->cl_intr && signalled())) {
+	if (RPC_IS_ASYNC(task) || !signalled()) {
 		xprt_release(task);
 		task->tk_action = call_reserve;
 		rpc_delay(task, HZ>>4);
-- 
cgit

From 0f9dc2b16884bb5957d010ed8e9114e771a05916 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Wed, 22 Jun 2005 17:16:28 +0000
Subject: [PATCH] RPC: Clean up socket autodisconnect

Cancel autodisconnect requests inside xprt_transmit() in order to avoid
races.

Use more efficient del_singleshot_timer_sync()

Signed-off-by: Trond Myklebust
---
 net/sunrpc/xprt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
(limited to 'net')

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index ef941e7de8bf..a74a1289113e 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1240,6 +1240,8 @@ xprt_transmit(struct rpc_task *task)
 			list_add_tail(&req->rq_list, &xprt->recv);
 			spin_unlock_bh(&xprt->sock_lock);
 			xprt_reset_majortimeo(req);
+			/* Turn off autodisconnect */
+			del_singleshot_timer_sync(&xprt->timer);
 		}
 	} else if (!req->rq_bytes_sent)
 		return;
@@ -1370,8 +1372,6 @@ xprt_reserve(struct rpc_task *task)
 		spin_lock(&xprt->xprt_lock);
 		do_xprt_reserve(task);
 		spin_unlock(&xprt->xprt_lock);
-		if (task->tk_rqstp)
-			del_timer_sync(&xprt->timer);
 	}
 }
-- 
cgit

From 20e5ac828dfd23b9080159c62a34f32d2dcd92fc Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Wed, 22 Jun 2005 17:16:28 +0000
Subject: [PATCH] RPC: TCP reconnects are too slow

When the network layer reports a connection close, the RPC task waiting
to reconnect should be notified so it can retry immediately instead of
waiting for the normal connection establishment timeout.

This reverts a change made in 2.6.6 as part of adding client support for
RPC over TCP socket idle timeouts.

Test-plan:
Destructive testing with NFS over TCP mounts.

Version: Fri, 29 Apr 2005 15:31:46 -0400

Signed-off-by: Chuck Lever
Signed-off-by: Trond Myklebust
---
 net/sunrpc/xprt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
(limited to 'net')

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index a74a1289113e..2b8789cf8db1 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1101,8 +1101,7 @@ tcp_state_change(struct sock *sk)
 	case TCP_SYN_RECV:
 		break;
 	default:
-		if (xprt_test_and_clear_connected(xprt))
-			rpc_wake_up_status(&xprt->pending, -ENOTCONN);
+		xprt_disconnect(xprt);
 		break;
 	}
out:
-- 
cgit
From ae3884621bf5b4caff7785b9a417f262202965b2 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Wed, 22 Jun 2005 17:16:28 +0000
Subject: [PATCH] RPC: kick off socket connect operations faster

Make the socket transport kick the event queue to start socket connects
immediately. This should improve responsiveness of applications that are
sensitive to slow mount operations (like automounters).

We are now also careful to cancel the connect worker before destroying
the xprt. This eliminates a race where xprt_destroy can finish before the
connect worker is even allowed to run.

Test-plan:
Destructive testing (unplugging the network temporarily). Connectathon
with UDP and TCP. Hard-code impossibly small connect timeout.

Version: Fri, 29 Apr 2005 15:32:01 -0400

Signed-off-by: Chuck Lever
Signed-off-by: Trond Myklebust
---
 net/sunrpc/xprt.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)
(limited to 'net')

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 2b8789cf8db1..eca92405948f 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -569,8 +569,11 @@ void xprt_connect(struct rpc_task *task)
 		if (xprt->sock != NULL)
 			schedule_delayed_work(&xprt->sock_connect,
 					RPC_REESTABLISH_TIMEOUT);
-		else
+		else {
 			schedule_work(&xprt->sock_connect);
+			if (!RPC_IS_ASYNC(task))
+				flush_scheduled_work();
+		}
 	}
 	return;
out_write:
@@ -1685,6 +1688,10 @@ xprt_shutdown(struct rpc_xprt *xprt)
 	rpc_wake_up(&xprt->backlog);
 	wake_up(&xprt->cong_wait);
 	del_timer_sync(&xprt->timer);
+
+	/* synchronously wait for connect worker to finish */
+	cancel_delayed_work(&xprt->sock_connect);
+	flush_scheduled_work();
 }
-- 
cgit

From f31f5f051269746179b01017fc5e3dcf6b37c67e Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 22 Jun 2005 14:32:51 -0700
Subject: [NET]: don't use strlen() but the result from a prior sprintf()

Small patch to save an unnecessary call to strlen(): sprintf() gave us
the length, just trust it.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/socket.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index 38729af09461..6f2a17881972 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -383,9 +383,8 @@ int sock_map_fd(struct socket *sock)
 			goto out;
 		}

-		sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino);
+		this.len = sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino);
 		this.name = name;
-		this.len = strlen(name);
 		this.hash = SOCK_INODE(sock)->i_ino;

 		file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);
-- 
cgit
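The point of the strlen() patch generalizes beyond the kernel: sprintf() returns the number of characters written (excluding the terminating NUL), so a following strlen() on the same buffer is pure redundant work. A user-space illustration:

	#include <assert.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char name[32];
		int len = sprintf(name, "[%lu]", 1234UL);

		assert(len == (int)strlen(name));	/* always holds */
		return 0;
	}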
From 115c1d6e61b70851d9a363328c3b8d4c2559a1d3 Mon Sep 17 00:00:00 2001
From: Jeff Moyer
Date: Wed, 22 Jun 2005 22:05:31 -0700
Subject: [NETPOLL]: Introduce a netpoll_info struct

This patch introduces a netpoll_info structure, which the struct
net_device will now point to instead of pointing to a struct netpoll.
The reason for this is two-fold: 1) fields such as the rx_flags,
poll_owner, and poll_lock should be maintained per net_device, not per
netpoll; and 2) this is a first step in providing support for multiple
netpoll clients to register against the same net_device.

The struct netpoll is now pointed to by the netpoll_info structure. As
such, the previous behaviour of the code is preserved.

Signed-off-by: Jeff Moyer
Signed-off-by: David S. Miller
---
 net/core/netpoll.c | 57 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 38 insertions(+), 19 deletions(-)
(limited to 'net')

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a119696d5521..ab3c0c9713b0 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -130,19 +130,20 @@ static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
  */
 static void poll_napi(struct netpoll *np)
 {
+	struct netpoll_info *npinfo = np->dev->npinfo;
 	int budget = 16;

 	if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) &&
-	    np->poll_owner != smp_processor_id() &&
-	    spin_trylock(&np->poll_lock)) {
-		np->rx_flags |= NETPOLL_RX_DROP;
+	    npinfo->poll_owner != smp_processor_id() &&
+	    spin_trylock(&npinfo->poll_lock)) {
+		npinfo->rx_flags |= NETPOLL_RX_DROP;
 		atomic_inc(&trapped);

 		np->dev->poll(np->dev, &budget);

 		atomic_dec(&trapped);
-		np->rx_flags &= ~NETPOLL_RX_DROP;
-		spin_unlock(&np->poll_lock);
+		npinfo->rx_flags &= ~NETPOLL_RX_DROP;
+		spin_unlock(&npinfo->poll_lock);
 	}
 }

@@ -245,6 +246,7 @@ repeat:
 static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 {
 	int status;
+	struct netpoll_info *npinfo;

repeat:
 	if(!np || !np->dev || !netif_running(np->dev)) {
@@ -253,8 +255,9 @@ repeat:
 	}

 	/* avoid recursion */
-	if(np->poll_owner == smp_processor_id() ||
-	   np->dev->xmit_lock_owner == smp_processor_id()) {
+	npinfo = np->dev->npinfo;
+	if (npinfo->poll_owner == smp_processor_id() ||
+	    np->dev->xmit_lock_owner == smp_processor_id()) {
 		if (np->drop)
 			np->drop(skb);
 		else
@@ -341,14 +344,18 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)

 static void arp_reply(struct sk_buff *skb)
 {
+	struct netpoll_info *npinfo = skb->dev->npinfo;
 	struct arphdr *arp;
 	unsigned char *arp_ptr;
 	int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
 	u32 sip, tip;
 	struct sk_buff *send_skb;
-	struct netpoll *np = skb->dev->np;
+	struct netpoll *np = NULL;

-	if (!np) return;
+	if (npinfo)
+		np = npinfo->np;
+	if (!np)
+		return;

 	/* No arp on this interface */
 	if (skb->dev->flags & IFF_NOARP)
@@ -429,7 +436,7 @@ int __netpoll_rx(struct sk_buff *skb)
 	int proto, len, ulen;
 	struct iphdr *iph;
 	struct udphdr *uh;
-	struct netpoll *np = skb->dev->np;
+	struct netpoll *np = skb->dev->npinfo->np;

 	if (!np->rx_hook)
 		goto out;
@@ -611,9 +618,7 @@ int netpoll_setup(struct netpoll *np)
 {
 	struct net_device *ndev = NULL;
 	struct in_device *in_dev;
-
-	np->poll_lock = SPIN_LOCK_UNLOCKED;
-	np->poll_owner = -1;
+	struct netpoll_info *npinfo;

 	if (np->dev_name)
 		ndev = dev_get_by_name(np->dev_name);
@@ -624,7 +629,16 @@ int netpoll_setup(struct netpoll *np)
 	}

 	np->dev = ndev;
-	ndev->np = np;
+	if (!ndev->npinfo) {
+		npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
+		if (!npinfo)
+			goto release;
+
+		npinfo->np = NULL;
+		npinfo->poll_lock = SPIN_LOCK_UNLOCKED;
+		npinfo->poll_owner = -1;
+	} else
+		npinfo = ndev->npinfo;

 	if (!ndev->poll_controller) {
 		printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
@@ -693,12 +707,15 @@ int netpoll_setup(struct netpoll *np)
 	}

 	if(np->rx_hook)
-		np->rx_flags = NETPOLL_RX_ENABLED;
+		npinfo->rx_flags = NETPOLL_RX_ENABLED;
+	npinfo->np = np;
+	ndev->npinfo = npinfo;

 	return 0;

release:
-	ndev->np = NULL;
+	if (!ndev->npinfo)
+		kfree(npinfo);
 	np->dev = NULL;
 	dev_put(ndev);
 	return -1;
@@ -706,9 +723,11 @@ int netpoll_setup(struct netpoll *np)

 void netpoll_cleanup(struct netpoll *np)
 {
-	if (np->dev)
-		np->dev->np = NULL;
-	dev_put(np->dev);
+	if (np->dev) {
+		if (np->dev->npinfo)
+			np->dev->npinfo->np = NULL;
+		dev_put(np->dev);
+	}
 	np->dev = NULL;
 }
-- 
cgit
cgit From fbeec2e1552949002065435c9829dc244ad85407 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 22 Jun 2005 22:05:59 -0700 Subject: [NETPOLL]: allow multiple netpoll_clients to register against one interface This patch provides support for registering multiple netpoll clients to the same network device. Only one of these clients may register an rx_hook, however. In practice, this restriction has not been problematic. It is worth mentioning, though, that the current design can be easily extended to allow for the registration of multiple rx_hooks. The basic idea of the patch is that the rx_np pointer in the netpoll_info structure points to the struct netpoll that has rx_hook filled in. Aside from this one case, there is no need for a pointer from the struct net_device to an individual struct netpoll. A lock is introduced to protect the setting and clearing of the rx_np pointer. The pointer will only be cleared upon netpoll client module removal, and the lock should be uncontested. Signed-off-by: Jeff Moyer Signed-off-by: David S. Miller --- net/core/netpoll.c | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index ab3c0c9713b0..c327c9edadc5 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -349,11 +349,15 @@ static void arp_reply(struct sk_buff *skb) unsigned char *arp_ptr; int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; u32 sip, tip; + unsigned long flags; struct sk_buff *send_skb; struct netpoll *np = NULL; - if (npinfo) - np = npinfo->np; + spin_lock_irqsave(&npinfo->rx_lock, flags); + if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev) + np = npinfo->rx_np; + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + if (!np) return; @@ -436,9 +440,9 @@ int __netpoll_rx(struct sk_buff *skb) int proto, len, ulen; struct iphdr *iph; struct udphdr *uh; - struct netpoll *np = skb->dev->npinfo->np; + struct netpoll *np = skb->dev->npinfo->rx_np; - if (!np->rx_hook) + if (!np) goto out; if (skb->dev->type != ARPHRD_ETHER) goto out; @@ -619,6 +623,7 @@ int netpoll_setup(struct netpoll *np) struct net_device *ndev = NULL; struct in_device *in_dev; struct netpoll_info *npinfo; + unsigned long flags; if (np->dev_name) ndev = dev_get_by_name(np->dev_name); @@ -634,9 +639,10 @@ int netpoll_setup(struct netpoll *np) if (!npinfo) goto release; - npinfo->np = NULL; + npinfo->rx_np = NULL; npinfo->poll_lock = SPIN_LOCK_UNLOCKED; npinfo->poll_owner = -1; + npinfo->rx_lock = SPIN_LOCK_UNLOCKED; } else npinfo = ndev->npinfo; @@ -706,9 +712,13 @@ int netpoll_setup(struct netpoll *np) np->name, HIPQUAD(np->local_ip)); } - if(np->rx_hook) - npinfo->rx_flags = NETPOLL_RX_ENABLED; - npinfo->np = np; + if (np->rx_hook) { + spin_lock_irqsave(&npinfo->rx_lock, flags); + npinfo->rx_flags |= NETPOLL_RX_ENABLED; + npinfo->rx_np = np; + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + } + /* last thing to do is link it to the net device structure */ ndev->npinfo = npinfo; return 0; @@ -723,11 +733,20 @@ int netpoll_setup(struct netpoll *np) void netpoll_cleanup(struct netpoll *np) { + struct netpoll_info *npinfo; + unsigned long flags; + if (np->dev) { - if (np->dev->npinfo) - np->dev->npinfo->np = NULL; + npinfo = np->dev->npinfo; + if (npinfo && npinfo->rx_np == np) { + spin_lock_irqsave(&npinfo->rx_lock, flags); + npinfo->rx_np = NULL; + npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + } dev_put(np->dev); } + np->dev = NULL; } --
7abaa27c1c54208bd76fa8bae55839c034aebfb2 Mon Sep 17 00:00:00 2001 From: Chuck Short Date: Wed, 22 Jun 2005 22:10:23 -0700 Subject: [IPV4]: Fix route.c gcc4 warnings Signed-off-by: Chuck Short Signed-off-by: David S. Miller --- net/ipv4/route.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f4d53c919869..80cf633d9f4a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1767,7 +1767,7 @@ static inline int ip_mkroute_input_def(struct sk_buff *skb, struct in_device *in_dev, u32 daddr, u32 saddr, u32 tos) { - struct rtable* rth; + struct rtable* rth = NULL; int err; unsigned hash; @@ -1794,7 +1794,7 @@ static inline int ip_mkroute_input(struct sk_buff *skb, u32 daddr, u32 saddr, u32 tos) { #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED - struct rtable* rth; + struct rtable* rth = NULL; unsigned char hop, hopcount, lasthop; int err = -EINVAL; unsigned int hash; @@ -2239,7 +2239,7 @@ static inline int ip_mkroute_output_def(struct rtable **rp, struct net_device *dev_out, unsigned flags) { - struct rtable *rth; + struct rtable *rth = NULL; int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); unsigned hash; if (err == 0) { @@ -2267,7 +2267,7 @@ static inline int ip_mkroute_output(struct rtable** rp, unsigned char hop; unsigned hash; int err = -EINVAL; - struct rtable *rth; + struct rtable *rth = NULL; if (res->fi && res->fi->fib_nhs > 1) { unsigned char hopcount = res->fi->fib_nhs; -- cgit From 285b3afefacff14bc98e5754b8b48a0a2b42f0df Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Wed, 22 Jun 2005 22:11:44 -0700 Subject: [ATALK] aarp: replace schedule_timeout() with msleep() From: Nishanth Aravamudan Use msleep() instead of schedule_timeout() to guarantee the task delays as expected. The current code is not wrong, but it does not account for early return due to signals, so I think msleep() should be appropriate. Signed-off-by: Nishanth Aravamudan Signed-off-by: Domen Puncer Signed-off-by: David S. Miller --- net/appletalk/aarp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 10d040461021..c34614ea5fce 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -35,6 +35,7 @@ #include #include #include +#include <linux/delay.h> #include #include #include @@ -462,8 +463,7 @@ void aarp_probe_network(struct atalk_iface *atif) aarp_send_probe(atif->dev, &atif->address); /* Defer 1/10th */ - current->state = TASK_INTERRUPTIBLE; - schedule_timeout(HZ / 10); + msleep(100); if (atif->status & ATIF_PROBE_FAIL) break; @@ -510,9 +510,8 @@ int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa) aarp_send_probe(atif->dev, sa); /* Defer 1/10th */ - current->state = TASK_INTERRUPTIBLE; write_unlock_bh(&aarp_lock); - schedule_timeout(HZ / 10); + msleep(100); write_lock_bh(&aarp_lock); if (entry->status & ATIF_PROBE_FAIL) -- cgit From 68d318720052154bc6b2513b0f15d0d947cc53c9 Mon Sep 17 00:00:00 2001 From: James Lamanna Date: Wed, 22 Jun 2005 22:12:57 -0700 Subject: [EBTABLES]: vfree() checking cleanups From: jlamanna@gmail.com ebtables.c vfree() checking cleanups. Signed-off-by: James Lamanna Signed-off-by: Domen Puncer Signed-off-by: David S.
Miller --- net/bridge/netfilter/ebtables.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 18ebc664769b..c4540144f0f4 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -859,8 +859,7 @@ static int translate_table(struct ebt_replace *repl, if (repl->valid_hooks & (1 << i)) if (check_chainloops(newinfo->hook_entry[i], cl_s, udc_cnt, i, newinfo->entries)) { - if (cl_s) - vfree(cl_s); + vfree(cl_s); return -EINVAL; } @@ -883,8 +882,7 @@ static int translate_table(struct ebt_replace *repl, EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, ebt_cleanup_entry, &i); } - if (cl_s) - vfree(cl_s); + vfree(cl_s); return ret; } @@ -1030,8 +1028,7 @@ static int do_replace(void __user *user, unsigned int len) } vfree(table); - if (counterstmp) - vfree(counterstmp); + vfree(counterstmp); return ret; free_unlock: @@ -1040,8 +1037,7 @@ free_iterate: EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, ebt_cleanup_entry, NULL); free_counterstmp: - if (counterstmp) - vfree(counterstmp); + vfree(counterstmp); /* can be initialized in translate_table() */ if (newinfo->chainstack) { for (i = 0; i < num_possible_cpus(); i++) @@ -1049,11 +1045,9 @@ free_counterstmp: vfree(newinfo->chainstack); } free_entries: - if (newinfo->entries) - vfree(newinfo->entries); + vfree(newinfo->entries); free_newinfo: - if (newinfo) - vfree(newinfo); + vfree(newinfo); return ret; } @@ -1213,8 +1207,7 @@ void ebt_unregister_table(struct ebt_table *table) down(&ebt_mutex); LIST_DELETE(&ebt_tables, table); up(&ebt_mutex); - if (table->private->entries) - vfree(table->private->entries); + vfree(table->private->entries); if (table->private->chainstack) { for (i = 0; i < num_possible_cpus(); i++) vfree(table->private->chainstack[i]); -- cgit From cb65d506c34c86df5bcef939ce5a8666a451bd8b Mon Sep 17 00:00:00 2001 From: Shaun Pereira Date: Wed, 22 Jun 2005 22:15:01 -0700 Subject: [X25]: Selective sub-address matching with call user data. From: Shaun Pereira This is the first (independent of the second) patch of two that I am working on with x25 on linux (tested with xot on a cisco router). Details are as follows. Current state of module: A server using the current implementation (2.6.11.7) of the x25 module will accept a call request/ incoming call packet at the listening x.25 address, from all callers to that address, as long as NO call user data is present in the packet header. If the server needs to choose to accept a particular call request/ incoming call packet arriving at its listening x25 address, then the kernel has to allow a match of call user data present in the call request packet with its own. This is required when multiple servers listen at the same x25 address and device interface. The kernel currently matches ALL call user data, if present. Current Changes: This patch is a follow up to the patch submitted previously by Andrew Hendry, and allows the user to selectively control the number of octets of call user data in the call request packet, that the kernel will match. By default no call user data is matched, even if call user data is present. To allow call user data matching, a cudmatchlength > 0 has to be passed into the kernel after which the passed number of octets will be matched. Otherwise the kernel behavior is exactly as the original implementation. 
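(Editorial aside, not part of the patch: to make the new matching concrete, here is a minimal userspace sketch of a listener that asks the kernel to compare the first four octets of call user data. It assumes a kernel with this patch applied, so that linux/x25.h provides struct x25_subaddr and SIOCX25SCUDMATCHLEN; the socket's own expected call user data would be set beforehand with the pre-existing SIOCX25SCALLUSERDATA ioctl, which is not shown.)

#include <sys/socket.h>
#include <sys/ioctl.h>
#include <linux/x25.h>

/* Hypothetical helper: listen for calls, letting the kernel compare
 * the first 4 octets of incoming call user data against ours. */
static int x25_listen_cud_match(struct sockaddr_x25 *locl_addr)
{
	struct x25_subaddr sub_addr;
	int call_soc = socket(AF_X25, SOCK_SEQPACKET, 0);

	if (call_soc < 0)
		return -1;
	if (bind(call_soc, (struct sockaddr *)locl_addr, sizeof(*locl_addr)) < 0)
		return -1;

	/* Must be issued while the socket is still closed, i.e. before
	 * listen(); the value may not exceed X25_MAX_CUD_LEN. */
	sub_addr.cudmatchlength = 4;
	if (ioctl(call_soc, SIOCX25SCUDMATCHLEN, &sub_addr) < 0)
		return -1;

	return listen(call_soc, 4) < 0 ? -1 : call_soc;
}

Leaving cudmatchlength at its default of 0 keeps the old selection behaviour: the socket is chosen as a fallback listener without any call user data comparison.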
This patch also ensures that, as is normally the case, no call user data will be present in the Call accepted / call connected packet sent back to the caller. Future Changes in the next patch: There are cases, however, when call user data may be present in the call accepted packet. According to the X.25 recommendation (ITU-T 10/96) section 5.2.3.2 call user data may be present in the call accepted packet provided the fast select facility is used. My next patch will include this fast select facility and the ability to send up to 128 octets call user data in the call accepted packet provided the fast select facility is used. I am currently testing this, again with xot on linux and cisco. Signed-off-by: Shaun Pereira (With a fix from Alexey Dobriyan ) Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/x25/af_x25.c | 73 +++++++++++++++++++++++++++++++++++------------------- net/x25/x25_subr.c | 18 -------------- 2 files changed, 48 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 2a24b243b841..e17d84a55d5e 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -29,6 +29,8 @@ * 2000-11-14 Henner Eisen Closing datalink from NETDEV_GOING_DOWN * 2002-10-06 Arnaldo C. Melo Get rid of cli/sti, move proc stuff to * x25_proc.c, using seq_file + * 2005-04-02 Shaun Pereira Selective sub address matching + * with call user data */ #include @@ -219,7 +221,8 @@ static void x25_insert_socket(struct sock *sk) * Note: if a listening socket has cud set it must only get calls * with matching cud. */ -static struct sock *x25_find_listener(struct x25_address *addr, struct x25_calluserdata *calluserdata) +static struct sock *x25_find_listener(struct x25_address *addr, + struct sk_buff *skb) { struct sock *s; struct sock *next_best; @@ -230,22 +233,23 @@ static struct sock *x25_find_listener(struct x25_address *addr, struct x25_callu sk_for_each(s, node, &x25_list) if ((!strcmp(addr->x25_addr, - x25_sk(s)->source_addr.x25_addr) || - !strcmp(addr->x25_addr, - null_x25_address.x25_addr)) && - s->sk_state == TCP_LISTEN) { - + x25_sk(s)->source_addr.x25_addr) || + !strcmp(addr->x25_addr, + null_x25_address.x25_addr)) && + s->sk_state == TCP_LISTEN) { /* * Found a listening socket, now check the incoming * call user data vs this sockets call user data */ - if (x25_check_calluserdata(&x25_sk(s)->calluserdata, calluserdata)) { - sock_hold(s); - goto found; - } - if (x25_sk(s)->calluserdata.cudlength == 0) { + if(skb->len > 0 && x25_sk(s)->cudmatchlength > 0) { + if((memcmp(x25_sk(s)->calluserdata.cuddata, + skb->data, + x25_sk(s)->cudmatchlength)) == 0) { + sock_hold(s); + goto found; + } + } else next_best = s; - } } if (next_best) { s = next_best; @@ -497,6 +501,7 @@ static int x25_create(struct socket *sock, int protocol) x25->t23 = sysctl_x25_clear_request_timeout; x25->t2 = sysctl_x25_ack_holdback_timeout; x25->state = X25_STATE_0; + x25->cudmatchlength = 0; x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE; x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE; @@ -545,6 +550,7 @@ static struct sock *x25_make_new(struct sock *osk) x25->t2 = ox25->t2; x25->facilities = ox25->facilities; x25->qbitincl = ox25->qbitincl; + x25->cudmatchlength = ox25->cudmatchlength; x25_init_timers(sk); out: @@ -822,7 +828,6 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, struct x25_sock *makex25; struct x25_address source_addr, dest_addr; struct x25_facilities facilities; - struct x25_calluserdata calluserdata; int len, rc; /* @@ -844,20 +849,11
@@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, len = skb->data[0] + 1; skb_pull(skb,len); - /* - * Incoming Call User Data. - */ - if (skb->len >= 0) { - memcpy(calluserdata.cuddata, skb->data, skb->len); - calluserdata.cudlength = skb->len; - } - - skb_push(skb,len); - /* * Find a listener for the particular address/cud pair. */ - sk = x25_find_listener(&source_addr,&calluserdata); + sk = x25_find_listener(&source_addr,skb); + skb_push(skb,len); /* * We can't accept the Call Request. @@ -900,12 +896,22 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, makex25->neighbour = nb; makex25->facilities = facilities; makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask; - makex25->calluserdata = calluserdata; + /* ensure no reverse facil on accept */ + makex25->vc_facil_mask &= ~X25_MASK_REVERSE; + makex25->cudmatchlength = x25_sk(sk)->cudmatchlength; x25_write_internal(make, X25_CALL_ACCEPTED); makex25->state = X25_STATE_3; + /* + * Incoming Call User Data. + */ + if (skb->len >= 0) { + memcpy(makex25->calluserdata.cuddata, skb->data, skb->len); + makex25->calluserdata.cudlength = skb->len; + } + sk->sk_ack_backlog++; x25_insert_socket(make); @@ -1325,6 +1331,23 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; } + case SIOCX25SCUDMATCHLEN: { + struct x25_subaddr sub_addr; + rc = -EINVAL; + if(sk->sk_state != TCP_CLOSE) + break; + rc = -EFAULT; + if (copy_from_user(&sub_addr, argp, + sizeof(sub_addr))) + break; + rc = -EINVAL; + if(sub_addr.cudmatchlength > X25_MAX_CUD_LEN) + break; + x25->cudmatchlength = sub_addr.cudmatchlength; + rc = 0; + break; + } + default: rc = dev_ioctl(cmd, argp); break; diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index 183fea3bba67..c349bbd61684 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -354,21 +354,3 @@ void x25_check_rbuf(struct sock *sk) } } -/* - * Compare 2 calluserdata structures, used to find correct listening sockets - * when call user data is used. - */ -int x25_check_calluserdata(struct x25_calluserdata *ours, struct x25_calluserdata *theirs) -{ - int i; - if (ours->cudlength != theirs->cudlength) - return 0; - - for (i=0;i<ours->cudlength;i++) { - if (ours->cuddata[i] != theirs->cuddata[i]) { - return 0; - } - } - return 1; -} - -- cgit From ebc3f64b864fc16a594c2e63bf55a55c7d42084b Mon Sep 17 00:00:00 2001 From: Shaun Pereira Date: Wed, 22 Jun 2005 22:16:17 -0700 Subject: [X25]: Fast select with no restriction on response This patch is a follow-up to patch 1 regarding "Selective Sub Address matching with call user data". It allows use of the Fast-Select-Acceptance optional user facility for X.25. This patch just implements fast select with no restriction on response (NRR). What this means (according to ITU-T Recommendation 10/96 section 6.16) is that if, in an incoming call packet, the relevant facility bits are set for fast-select-NRR, then the called DTE can issue a direct response to the incoming packet using a call-accepted packet that contains call-user-data. This patch allows such a response. The called DTE can also respond with a clear-request packet that contains call-user-data. However, this feature is currently not implemented by the patch. How is Fast Select Acceptance used? By default, the system does not allow fast select acceptance (as before).
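(Editorial aside, not part of the patch: the calling sequence spelled out step by step below, condensed into one sketch. Error handling is omitted, locl_addr is assumed to be a filled-in struct sockaddr_x25, and the accept() call that yields the vc_soc descriptor is implied rather than stated by the original text.)

#include <sys/socket.h>
#include <sys/ioctl.h>
#include <linux/x25.h>

static void serve_fast_select(struct sockaddr_x25 *locl_addr)
{
	int call_soc, vc_soc;

	call_soc = socket(AF_X25, SOCK_SEQPACKET, 0);
	bind(call_soc, (struct sockaddr *)locl_addr, sizeof(*locl_addr));
	/* defer the call-accepted packet instead of sending it as soon
	 * as the incoming call is matched to this listener */
	ioctl(call_soc, SIOCX25CALLACCPTAPPRV);
	listen(call_soc, 4);
	vc_soc = accept(call_soc, NULL, NULL);	/* call not yet accepted */
	/* ... stage call user data on vc_soc here if desired ... */
	ioctl(vc_soc, SIOCX25SENDCALLACCPT);	/* now emit call-accepted */
}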
To enable a response to fast select acceptance, after a listen socket is created and bound as follows: socket(AF_X25, SOCK_SEQPACKET, 0); bind(call_soc, (struct sockaddr *)&locl_addr, sizeof(locl_addr)); but before a listen system call is made, the following ioctl should be used: ioctl(call_soc,SIOCX25CALLACCPTAPPRV); Now the listen system call can be made: listen(call_soc, 4); After this, an incoming-call packet will be accepted, but no call-accepted packet will be sent back until the following system call is made on the socket that accepts the call: ioctl(vc_soc,SIOCX25SENDCALLACCPT); The network (or cisco xot router used for testing here) will allow the application server's call-user-data in the call-accepted packet, provided the call-request was made with Fast-select NRR. Signed-off-by: Shaun Pereira Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/x25/af_x25.c | 37 +++++++++++++++++++++++++++++---- net/x25/x25_facilities.c | 34 +++++++++++++++++++++++++++++----- net/x25/x25_subr.c | 23 ++++++++++++++++++----- 3 files changed, 80 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index e17d84a55d5e..04bec047fa9a 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -31,6 +31,8 @@ * x25_proc.c, using seq_file * 2005-04-02 Shaun Pereira Selective sub address matching * with call user data + * 2005-04-15 Shaun Pereira Fast select with no restriction on + * response */ #include @@ -502,6 +504,8 @@ static int x25_create(struct socket *sock, int protocol) x25->t2 = sysctl_x25_ack_holdback_timeout; x25->state = X25_STATE_0; x25->cudmatchlength = 0; + x25->accptapprv = X25_DENY_ACCPT_APPRV; /* normally no cud */ + /* on call accept */ x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE; x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE; @@ -551,6 +555,7 @@ static struct sock *x25_make_new(struct sock *osk) x25->facilities = ox25->facilities; x25->qbitincl = ox25->qbitincl; x25->cudmatchlength = ox25->cudmatchlength; + x25->accptapprv = ox25->accptapprv; x25_init_timers(sk); out: @@ -900,9 +905,11 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, makex25->vc_facil_mask &= ~X25_MASK_REVERSE; makex25->cudmatchlength = x25_sk(sk)->cudmatchlength; - x25_write_internal(make, X25_CALL_ACCEPTED); - - makex25->state = X25_STATE_3; + /* Normally all calls are accepted immediatly */ + if(makex25->accptapprv & X25_DENY_ACCPT_APPRV) { + x25_write_internal(make, X25_CALL_ACCEPTED); + makex25->state = X25_STATE_3; + } /* * Incoming Call User Data.
@@ -1294,7 +1301,8 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) if (facilities.throughput < 0x03 || facilities.throughput > 0xDD) break; - if (facilities.reverse && facilities.reverse != 1) + if (facilities.reverse && + (facilities.reverse | 0x81)!= 0x81) break; x25->facilities = facilities; rc = 0; @@ -1348,6 +1356,27 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; } + case SIOCX25CALLACCPTAPPRV: { + rc = -EINVAL; + if (sk->sk_state != TCP_CLOSE) + break; + x25->accptapprv = X25_ALLOW_ACCPT_APPRV; + rc = 0; + break; + } + + case SIOCX25SENDCALLACCPT: { + rc = -EINVAL; + if (sk->sk_state != TCP_ESTABLISHED) + break; + if (x25->accptapprv) /* must call accptapprv above */ + break; + x25_write_internal(sk, X25_CALL_ACCEPTED); + x25->state = X25_STATE_3; + rc = 0; + break; + } + default: rc = dev_ioctl(cmd, argp); break; diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c index a21bdb95f9a8..54278b962f4c 100644 --- a/net/x25/x25_facilities.c +++ b/net/x25/x25_facilities.c @@ -17,6 +17,8 @@ * X.25 001 Split from x25_subr.c * mar/20/00 Daniela Squassoni Disabling/enabling of facilities * negotiation. + * apr/14/05 Shaun Pereira - Allow fast select with no restriction + * on response. */ #include @@ -43,9 +45,31 @@ int x25_parse_facilities(struct sk_buff *skb, case X25_FAC_CLASS_A: switch (*p) { case X25_FAC_REVERSE: - facilities->reverse = p[1] & 0x01; - *vc_fac_mask |= X25_MASK_REVERSE; - break; + if((p[1] & 0x81) == 0x81) { + facilities->reverse = p[1] & 0x81; + *vc_fac_mask |= X25_MASK_REVERSE; + break; + } + + if((p[1] & 0x01) == 0x01) { + facilities->reverse = p[1] & 0x01; + *vc_fac_mask |= X25_MASK_REVERSE; + break; + } + + if((p[1] & 0x80) == 0x80) { + facilities->reverse = p[1] & 0x80; + *vc_fac_mask |= X25_MASK_REVERSE; + break; + } + + if(p[1] == 0x00) { + facilities->reverse + = X25_DEFAULT_REVERSE; + *vc_fac_mask |= X25_MASK_REVERSE; + break; + } + case X25_FAC_THROUGHPUT: facilities->throughput = p[1]; *vc_fac_mask |= X25_MASK_THROUGHPUT; @@ -122,7 +146,7 @@ int x25_create_facilities(unsigned char *buffer, if (facilities->reverse && (facil_mask & X25_MASK_REVERSE)) { *p++ = X25_FAC_REVERSE; - *p++ = !!facilities->reverse; + *p++ = facilities->reverse; } if (facilities->throughput && (facil_mask & X25_MASK_THROUGHPUT)) { @@ -171,7 +195,7 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk, /* * They want reverse charging, we won't accept it. */ - if (theirs.reverse && ours->reverse) { + if ((theirs.reverse & 0x01 ) && (ours->reverse & 0x01)) { SOCK_DEBUG(sk, "X.25: rejecting reverse charging request"); return -1; } diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index c349bbd61684..7fd872ad0c20 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -19,6 +19,8 @@ * mar/20/00 Daniela Squassoni Disabling/enabling of facilities * negotiation. * jun/24/01 Arnaldo C. Melo use skb_queue_purge, cleanups + * apr/04/15 Shaun Pereira Fast select with no + * restriction on response. 
*/ #include @@ -127,8 +129,12 @@ void x25_write_internal(struct sock *sk, int frametype) len += 1 + X25_ADDR_LEN + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN; break; - case X25_CALL_ACCEPTED: - len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN; + case X25_CALL_ACCEPTED: /* fast sel with no restr on resp */ + if(x25->facilities.reverse & 0x80) { + len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN; + } else { + len += 1 + X25_MAX_FAC_LEN; + } break; case X25_CLEAR_REQUEST: case X25_RESET_REQUEST: @@ -203,9 +209,16 @@ void x25_write_internal(struct sock *sk, int frametype) x25->vc_facil_mask); dptr = skb_put(skb, len); memcpy(dptr, facilities, len); - dptr = skb_put(skb, x25->calluserdata.cudlength); - memcpy(dptr, x25->calluserdata.cuddata, - x25->calluserdata.cudlength); + + /* fast select with no restriction on response + allows call user data. Userland must + ensure it is ours and not theirs */ + if(x25->facilities.reverse & 0x80) { + dptr = skb_put(skb, + x25->calluserdata.cudlength); + memcpy(dptr, x25->calluserdata.cuddata, + x25->calluserdata.cudlength); + } x25->calluserdata.cudlength = 0; break; -- cgit From 543537bd922692bc978e2e356fcd8bfc9c2ee7d5 Mon Sep 17 00:00:00 2001 From: Paulo Marques Date: Thu, 23 Jun 2005 00:09:02 -0700 Subject: [PATCH] create a kstrdup library function This patch creates a new kstrdup library function and changes the "local" implementations in several places to use this function. Most of the changes come from the sound and net subsystems. The sound part had already been acknowledged by Takashi Iwai and the net part by David S. Miller. I left UML alone for now because I would need more time to read the code carefully before making changes there. Signed-off-by: Paulo Marques Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/core/neighbour.c | 3 ++- net/core/sysctl_net_core.c | 15 --------------- net/ipv4/devinet.c | 2 +- net/ipv6/addrconf.c | 3 ++- net/sunrpc/svcauth_unix.c | 11 ++--------- 5 files changed, 7 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f6bdcad47da6..851eb927ed97 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -32,6 +32,7 @@ #include #include #include +#include #define NEIGH_DEBUG 1 @@ -2592,7 +2593,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, t->neigh_vars[17].extra1 = dev; } - dev_name = net_sysctl_strdup(dev_name_source); + dev_name = kstrdup(dev_name_source, GFP_KERNEL); if (!dev_name) { err = -ENOBUFS; goto free; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index c8be646cb191..880a88815211 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -35,19 +35,6 @@ extern int sysctl_somaxconn; extern char sysctl_divert_version[]; #endif /* CONFIG_NET_DIVERT */ -/* - * This strdup() is used for creating copies of network - * device names to be handed over to sysctl. - */ - -char *net_sysctl_strdup(const char *s) -{ - char *rv = kmalloc(strlen(s)+1, GFP_KERNEL); - if (rv) - strcpy(rv, s); - return rv; -} - ctl_table core_table[] = { #ifdef CONFIG_NET { @@ -177,6 +164,4 @@ ctl_table core_table[] = { { .ctl_name = 0 } }; -EXPORT_SYMBOL(net_sysctl_strdup); - #endif diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 650dcb12d9a1..d8a10e3dd77d 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1471,7 +1471,7 @@ static void devinet_sysctl_register(struct in_device *in_dev, * by sysctl and we wouldn't want anyone to change it under our feet * (see SIOCSIFNAME). 
*/ - dev_name = net_sysctl_strdup(dev_name); + dev_name = kstrdup(dev_name, GFP_KERNEL); if (!dev_name) goto free; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 14f5c53235fe..a54d4ef3fd35 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -57,6 +57,7 @@ #endif #include #include +#include #include #include @@ -3437,7 +3438,7 @@ static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf * by sysctl and we wouldn't want anyone to change it under our feet * (see SIOCSIFNAME). */ - dev_name = net_sysctl_strdup(dev_name); + dev_name = kstrdup(dev_name, GFP_KERNEL); if (!dev_name) goto free; diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 2b99b4028d31..d6baf6fdf8a9 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -8,6 +8,7 @@ #include #include #include +#include #define RPCDBG_FACILITY RPCDBG_AUTH @@ -20,14 +21,6 @@ */ -static char *strdup(char *s) -{ - char *rv = kmalloc(strlen(s)+1, GFP_KERNEL); - if (rv) - strcpy(rv, s); - return rv; -} - struct unix_domain { struct auth_domain h; int addr_changes; @@ -55,7 +48,7 @@ struct auth_domain *unix_domain_find(char *name) if (new == NULL) return NULL; cache_init(&new->h.h); - new->h.name = strdup(name); + new->h.name = kstrdup(name, GFP_KERNEL); new->h.flavour = RPC_AUTH_UNIX; new->addr_changes = 0; new->h.h.expiry_time = NEVER; -- cgit From 317a76f9a44b437d6301718f4e5d08bd93f98da7 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:19:55 -0700 Subject: [TCP]: Add pluggable congestion control algorithm infrastructure. Allow TCP to have multiple pluggable congestion control algorithms. Algorithms are defined by a set of operations and can be built in or modules. The legacy "new RENO" algorithm is used as a starting point and fallback. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv4/Makefile | 3 +- net/ipv4/sysctl_net_ipv4.c | 114 +++---- net/ipv4/tcp.c | 2 + net/ipv4/tcp_cong.c | 195 ++++++++++++ net/ipv4/tcp_diag.c | 20 +- net/ipv4/tcp_input.c | 737 ++++----------------------------------------- net/ipv4/tcp_ipv4.c | 3 + net/ipv4/tcp_minisocks.c | 4 +- net/ipv4/tcp_output.c | 23 +- net/ipv6/tcp_ipv6.c | 2 +- 10 files changed, 313 insertions(+), 790 deletions(-) create mode 100644 net/ipv4/tcp_cong.c (limited to 'net') diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 65d57d8e1add..89c0b4cb470e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -5,7 +5,8 @@ obj-y := utils.o route.o inetpeer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ ip_output.o ip_sockglue.o \ - tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ + tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ + tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ sysctl_net_ipv4.o fib_frontend.o fib_semantics.o diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 23068bddbf0b..e32894532416 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -118,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table, return 1; } +static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char val[TCP_CA_NAME_MAX]; + ctl_table tbl = { + .data = val, + .maxlen = TCP_CA_NAME_MAX, + }; + int ret; + + tcp_get_default_congestion_control(val); + + ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); + if (write && ret == 0) + ret = tcp_set_default_congestion_control(val); + return ret; +} + +int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + char val[TCP_CA_NAME_MAX]; + ctl_table tbl = { + .data = val, + .maxlen = TCP_CA_NAME_MAX, + }; + int ret; + + tcp_get_default_congestion_control(val); + ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen, + context); + if (ret == 0 && newval && newlen) + ret = tcp_set_default_congestion_control(val); + return ret; +} + + ctl_table ipv4_table[] = { { .ctl_name = NET_IPV4_TCP_TIMESTAMPS, @@ -611,70 +650,6 @@ ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, - { - .ctl_name = NET_TCP_WESTWOOD, - .procname = "tcp_westwood", - .data = &sysctl_tcp_westwood, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS, - .procname = "tcp_vegas_cong_avoid", - .data = &sysctl_tcp_vegas_cong_avoid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_ALPHA, - .procname = "tcp_vegas_alpha", - .data = &sysctl_tcp_vegas_alpha, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_BETA, - .procname = "tcp_vegas_beta", - .data = &sysctl_tcp_vegas_beta, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_GAMMA, - .procname = "tcp_vegas_gamma", - .data = &sysctl_tcp_vegas_gamma, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_BIC, - .procname = "tcp_bic", - .data = &sysctl_tcp_bic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = 
NET_TCP_BIC_FAST_CONVERGENCE, - .procname = "tcp_bic_fast_convergence", - .data = &sysctl_tcp_bic_fast_convergence, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_BIC_LOW_WINDOW, - .procname = "tcp_bic_low_window", - .data = &sysctl_tcp_bic_low_window, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, { .ctl_name = NET_TCP_MODERATE_RCVBUF, .procname = "tcp_moderate_rcvbuf", @@ -692,13 +667,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec, }, { - .ctl_name = NET_TCP_BIC_BETA, - .procname = "tcp_bic_beta", - .data = &sysctl_tcp_bic_beta, - .maxlen = sizeof(int), + .ctl_name = NET_TCP_CONG_CONTROL, + .procname = "tcp_congestion_control", .mode = 0644, - .proc_handler = &proc_dointvec, + .maxlen = TCP_CA_NAME_MAX, + .proc_handler = &proc_tcp_congestion_control, + .strategy = &sysctl_tcp_congestion_control, }, + { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 674bbd8cfd36..f3dbc8dc1263 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2333,6 +2333,8 @@ void __init tcp_init(void) printk(KERN_INFO "TCP: Hash tables configured " "(established %d bind %d)\n", tcp_ehash_size << 1, tcp_bhash_size); + + tcp_register_congestion_control(&tcp_reno); } EXPORT_SYMBOL(tcp_accept); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c new file mode 100644 index 000000000000..665394a63ae4 --- /dev/null +++ b/net/ipv4/tcp_cong.c @@ -0,0 +1,195 @@ +/* + * Plugable TCP congestion control support and newReno + * congestion control. + * Based on ideas from I/O scheduler suport and Web100. + * + * Copyright (C) 2005 Stephen Hemminger + */ + +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(tcp_cong_list_lock); +static LIST_HEAD(tcp_cong_list); + +/* Simple linear search, don't expect many entries! */ +static struct tcp_congestion_ops *tcp_ca_find(const char *name) +{ + struct tcp_congestion_ops *e; + + list_for_each_entry(e, &tcp_cong_list, list) { + if (strcmp(e->name, name) == 0) + return e; + } + + return NULL; +} + +/* + * Attach new congestion control algorthim to the list + * of available options. + */ +int tcp_register_congestion_control(struct tcp_congestion_ops *ca) +{ + int ret = 0; + + /* all algorithms must implement ssthresh and cong_avoid ops */ + if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { + printk(KERN_ERR "TCP %s does not implement required ops\n", + ca->name); + return -EINVAL; + } + + spin_lock(&tcp_cong_list_lock); + if (tcp_ca_find(ca->name)) { + printk(KERN_NOTICE "TCP %s already registered\n", ca->name); + ret = -EEXIST; + } else { + list_add_rcu(&ca->list, &tcp_cong_list); + printk(KERN_INFO "TCP %s registered\n", ca->name); + } + spin_unlock(&tcp_cong_list_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tcp_register_congestion_control); + +/* + * Remove congestion control algorithm, called from + * the module's remove function. Module ref counts are used + * to ensure that this can't be done till all sockets using + * that method are closed. + */ +void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) +{ + spin_lock(&tcp_cong_list_lock); + list_del_rcu(&ca->list); + spin_unlock(&tcp_cong_list_lock); +} +EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); + +/* Assign choice of congestion control. 
*/ +void tcp_init_congestion_control(struct tcp_sock *tp) +{ + struct tcp_congestion_ops *ca; + + rcu_read_lock(); + list_for_each_entry_rcu(ca, &tcp_cong_list, list) { + if (try_module_get(ca->owner)) { + tp->ca_ops = ca; + break; + } + + } + rcu_read_unlock(); + + if (tp->ca_ops->init) + tp->ca_ops->init(tp); +} + +/* Manage refcounts on socket close. */ +void tcp_cleanup_congestion_control(struct tcp_sock *tp) +{ + if (tp->ca_ops->release) + tp->ca_ops->release(tp); + module_put(tp->ca_ops->owner); +} + +/* Used by sysctl to change default congestion control */ +int tcp_set_default_congestion_control(const char *name) +{ + struct tcp_congestion_ops *ca; + int ret = -ENOENT; + + spin_lock(&tcp_cong_list_lock); + ca = tcp_ca_find(name); +#ifdef CONFIG_KMOD + if (!ca) { + spin_unlock(&tcp_cong_list_lock); + + request_module("tcp_%s", name); + spin_lock(&tcp_cong_list_lock); + ca = tcp_ca_find(name); + } +#endif + + if (ca) { + list_move(&ca->list, &tcp_cong_list); + ret = 0; + } + spin_unlock(&tcp_cong_list_lock); + + return ret; +} + +/* Get current default congestion control */ +void tcp_get_default_congestion_control(char *name) +{ + struct tcp_congestion_ops *ca; + /* We will always have reno... */ + BUG_ON(list_empty(&tcp_cong_list)); + + rcu_read_lock(); + ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); + strncpy(name, ca->name, TCP_CA_NAME_MAX); + rcu_read_unlock(); +} + +/* + * TCP Reno congestion control + * This is special case used for fallback as well. + */ +/* This is Jacobson's slow start and congestion avoidance. + * SIGCOMM '88, p. 328. + */ +void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, + int flag) +{ + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } +} +EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); + +/* Slow start threshold is half the congestion window (min 2) */ +u32 tcp_reno_ssthresh(struct tcp_sock *tp) +{ + return max(tp->snd_cwnd >> 1U, 2U); +} +EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); + +/* Lower bound on congestion window. 
*/ +u32 tcp_reno_min_cwnd(struct tcp_sock *tp) +{ + return tp->snd_ssthresh/2; +} +EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); + +struct tcp_congestion_ops tcp_reno = { + .name = "reno", + .owner = THIS_MODULE, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, +}; + +EXPORT_SYMBOL_GPL(tcp_reno); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 634befc07921..867acc0f79d8 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -42,7 +42,6 @@ struct tcpdiag_entry static struct sock *tcpnl; - #define TCPDIAG_PUT(skb, attrtype, attrlen) \ ({ int rtalen = RTA_LENGTH(attrlen); \ struct rtattr *rta; \ @@ -61,7 +60,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, struct nlmsghdr *nlh; struct tcp_info *info = NULL; struct tcpdiag_meminfo *minfo = NULL; - struct tcpvegas_info *vinfo = NULL; unsigned char *b = skb->tail; nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); @@ -73,9 +71,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (ext & (1<<(TCPDIAG_INFO-1))) info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); - if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) - && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) - vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); } r->tcpdiag_family = sk->sk_family; r->tcpdiag_state = sk->sk_state; @@ -166,19 +161,8 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (info) tcp_get_info(sk, info); - if (vinfo) { - if (tcp_is_vegas(tp)) { - vinfo->tcpv_enabled = tp->vegas.doing_vegas_now; - vinfo->tcpv_rttcnt = tp->vegas.cntRTT; - vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT); - vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT); - } else { - vinfo->tcpv_enabled = 0; - vinfo->tcpv_rttcnt = 0; - vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt); - vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min); - } - } + if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info) + tp->ca_ops->get_info(tp, ext, skb); nlh->nlmsg_len = skb->tail - b; return skb->len; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bad504630a3..7bbbbc33eb4b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -61,7 +61,6 @@ * Panu Kuhlberg: Experimental audit of TCP (re)transmission * engine. Lots of bugs are found. * Pasi Sarolahti: F-RTO for dealing with spurious RTOs - * Angelo Dell'Aera: TCP Westwood+ support */ #include @@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337; int sysctl_tcp_max_orphans = NR_FILE; int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; -int sysctl_tcp_westwood; -int sysctl_tcp_vegas_cong_avoid; int sysctl_tcp_moderate_rcvbuf = 1; -/* Default values of the Vegas variables, in fixed-point representation - * with V_PARAM_SHIFT bits to the right of the binary point. - */ -#define V_PARAM_SHIFT 1 -int sysctl_tcp_vegas_alpha = 1<snd_cwnd_stamp = tcp_time_stamp; } -static void init_bictcp(struct tcp_sock *tp) -{ - tp->bictcp.cnt = 0; - - tp->bictcp.last_max_cwnd = 0; - tp->bictcp.last_cwnd = 0; - tp->bictcp.last_stamp = 0; -} - /* 5. Recalculate window clamp after socket hit its memory bounds. */ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) { @@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ tcp_grow_window(sk, tp, skb); } -/* When starting a new connection, pin down the current choice of - * congestion algorithm. 
- */ -void tcp_ca_init(struct tcp_sock *tp) -{ - if (sysctl_tcp_westwood) - tp->adv_cong = TCP_WESTWOOD; - else if (sysctl_tcp_bic) - tp->adv_cong = TCP_BIC; - else if (sysctl_tcp_vegas_cong_avoid) { - tp->adv_cong = TCP_VEGAS; - tp->vegas.baseRTT = 0x7fffffff; - tcp_vegas_enable(tp); - } -} - -/* Do RTT sampling needed for Vegas. - * Basically we: - * o min-filter RTT samples from within an RTT to get the current - * propagation delay + queuing delay (we are min-filtering to try to - * avoid the effects of delayed ACKs) - * o min-filter RTT samples from a much longer window (forever for now) - * to find the propagation delay (baseRTT) - */ -static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) -{ - __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */ - - /* Filter to find propagation delay: */ - if (vrtt < tp->vegas.baseRTT) - tp->vegas.baseRTT = vrtt; - - /* Find the min RTT during the last RTT to find - * the current prop. delay + queuing delay: - */ - tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt); - tp->vegas.cntRTT++; -} - /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge @@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) +static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) { long m = mrtt; /* RTT */ - if (tcp_vegas_enabled(tp)) - vegas_rtt_calc(tp, mrtt); - /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. @@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) tp->rtt_seq = tp->snd_nxt; } - tcp_westwood_update_rtt(tp, tp->srtt >> 3); + if (tp->ca_ops->rtt_sample) + tp->ca_ops->rtt_sample(tp, *usrtt); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk) tp->snd_una == tp->high_seq || (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); - if (!tcp_westwood_ssthresh(tp)) - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tcp_ca_event(tp, CA_EVENT_FRTO); } /* Have to clear retransmission markers here to keep the bookkeeping @@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk) tcp_set_ca_state(tp, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); - - init_bictcp(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how) if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tcp_ca_event(tp, CA_EVENT_LOSS); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; @@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) } /* Decrease cwnd each second ack. */ - static void tcp_cwnd_down(struct tcp_sock *tp) { int decr = tp->snd_cwnd_cnt + 1; - __u32 limit; - - /* - * TCP Westwood - * Here limit is evaluated as BWestimation*RTTmin (for obtaining it - * in packets we use mss_cache). 
If sysctl_tcp_westwood is off - * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is - * still used as usual. It prevents other strange cases in which - * BWE*RTTmin could assume value 0. It should not happen but... - */ - - if (!(limit = tcp_westwood_bw_rttmin(tp))) - limit = tp->snd_ssthresh/2; tp->snd_cwnd_cnt = decr&1; decr >>= 1; - if (decr && tp->snd_cwnd > limit) + if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) tp->snd_cwnd -= decr; tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); @@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) static void tcp_undo_cwr(struct tcp_sock *tp, int undo) { if (tp->prior_ssthresh) { - if (tcp_is_bic(tp)) - tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); + if (tp->ca_ops->undo_cwnd) + tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); @@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) static inline void tcp_complete_cwr(struct tcp_sock *tp) { - if (tcp_westwood_cwnd(tp)) - tp->snd_ssthresh = tp->snd_cwnd; - else - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_stamp = tcp_time_stamp; + tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); } static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) @@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, if (tp->ca_state < TCP_CA_CWR) { if (!(flag&FLAG_ECE)) tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); TCP_ECN_queue_cwr(tp); } @@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Superceeds RFC1323) */ -static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) +static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) { __u32 seq_rtt; @@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) * in window is lost... Voila. --ANK (010210) */ seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - tcp_rtt_estimator(tp, seq_rtt); + tcp_rtt_estimator(tp, seq_rtt, usrtt); tcp_set_rto(tp); tp->backoff = 0; tcp_bound_rto(tp); } -static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) +static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine @@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(tp, seq_rtt); + tcp_rtt_estimator(tp, seq_rtt, usrtt); tcp_set_rto(tp); tp->backoff = 0; tcp_bound_rto(tp); } static inline void tcp_ack_update_rtt(struct tcp_sock *tp, - int flag, s32 seq_rtt) + int flag, s32 seq_rtt, u32 *usrtt) { /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(tp, flag); + tcp_ack_saw_tstamp(tp, usrtt, flag); else if (seq_rtt >= 0) - tcp_ack_no_tstamp(tp, seq_rtt, flag); + tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); } -/* - * Compute congestion window to use. - * - * This is from the implementation of BICTCP in - * Lison-Xu, Kahaled Harfoush, and Injog Rhee. 
- * "Binary Increase Congestion Control for Fast, Long Distance - * Networks" in InfoComm 2004 - * Available from: - * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf - * - * Unless BIC is enabled and congestion window is large - * this behaves the same as the original Reno. - */ -static inline __u32 bictcp_cwnd(struct tcp_sock *tp) -{ - /* orignal Reno behaviour */ - if (!tcp_is_bic(tp)) - return tp->snd_cwnd; - - if (tp->bictcp.last_cwnd == tp->snd_cwnd && - (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5)) - return tp->bictcp.cnt; - - tp->bictcp.last_cwnd = tp->snd_cwnd; - tp->bictcp.last_stamp = tcp_time_stamp; - - /* start off normal */ - if (tp->snd_cwnd <= sysctl_tcp_bic_low_window) - tp->bictcp.cnt = tp->snd_cwnd; - - /* binary increase */ - else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) { - __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd) - / BICTCP_B; - - if (dist > BICTCP_MAX_INCREMENT) - /* linear increase */ - tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; - else if (dist <= 1U) - /* binary search increase */ - tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR - / BICTCP_B; - else - /* binary search increase */ - tp->bictcp.cnt = tp->snd_cwnd / dist; - } else { - /* slow start amd linear increase */ - if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B) - /* slow start */ - tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR - / BICTCP_B; - else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd - + BICTCP_MAX_INCREMENT*(BICTCP_B-1)) - /* slow start */ - tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1) - / (tp->snd_cwnd-tp->bictcp.last_max_cwnd); - else - /* linear increase */ - tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; - } - return tp->bictcp.cnt; -} - -/* This is Jacobson's slow start and congestion avoidance. - * SIGCOMM '88, p. 328. - */ -static inline void reno_cong_avoid(struct tcp_sock *tp) +static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, + u32 in_flight, int good) { - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { - /* In dangerous area, increase slowly. - * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt=0; - } else - tp->snd_cwnd_cnt++; - } + tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); tp->snd_cwnd_stamp = tcp_time_stamp; } -/* This is based on the congestion detection/avoidance scheme described in - * Lawrence S. Brakmo and Larry L. Peterson. - * "TCP Vegas: End to end congestion avoidance on a global internet." - * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, - * October 1995. Available from: - * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps - * - * See http://www.cs.arizona.edu/xkernel/ for their implementation. - * The main aspects that distinguish this implementation from the - * Arizona Vegas implementation are: - * o We do not change the loss detection or recovery mechanisms of - * Linux in any way. Linux already recovers from losses quite well, - * using fine-grained timers, NewReno, and FACK. - * o To avoid the performance penalty imposed by increasing cwnd - * only every-other RTT during slow start, we increase during - * every RTT during slow start, just like Reno. 
- * o Largely to allow continuous cwnd growth during slow start, - * we use the rate at which ACKs come back as the "actual" - * rate, rather than the rate at which data is sent. - * o To speed convergence to the right rate, we set the cwnd - * to achieve the right ("actual") rate when we exit slow start. - * o To filter out the noise caused by delayed ACKs, we use the - * minimum RTT sample observed during the last RTT to calculate - * the actual rate. - * o When the sender re-starts from idle, it waits until it has - * received ACKs for an entire flight of new data before making - * a cwnd adjustment decision. The original Vegas implementation - * assumed senders never went idle. - */ -static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) -{ - /* The key players are v_beg_snd_una and v_beg_snd_nxt. - * - * These are so named because they represent the approximate values - * of snd_una and snd_nxt at the beginning of the current RTT. More - * precisely, they represent the amount of data sent during the RTT. - * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, - * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding - * bytes of data have been ACKed during the course of the RTT, giving - * an "actual" rate of: - * - * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) - * - * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, - * because delayed ACKs can cover more than one segment, so they - * don't line up nicely with the boundaries of RTTs. - * - * Another unfortunate fact of life is that delayed ACKs delay the - * advance of the left edge of our send window, so that the number - * of bytes we send in an RTT is often less than our cwnd will allow. - * So we keep track of our cwnd separately, in v_beg_snd_cwnd. - */ - - if (after(ack, tp->vegas.beg_snd_nxt)) { - /* Do the Vegas once-per-RTT cwnd adjustment. */ - u32 old_wnd, old_snd_cwnd; - - - /* Here old_wnd is essentially the window of data that was - * sent during the previous RTT, and has all - * been acknowledged in the course of the RTT that ended - * with the ACK we just received. Likewise, old_snd_cwnd - * is the cwnd during the previous RTT. - */ - old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) / - tp->mss_cache_std; - old_snd_cwnd = tp->vegas.beg_snd_cwnd; - - /* Save the extent of the current window so we can use this - * at the end of the next RTT. - */ - tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt; - tp->vegas.beg_snd_nxt = tp->snd_nxt; - tp->vegas.beg_snd_cwnd = tp->snd_cwnd; - - /* Take into account the current RTT sample too, to - * decrease the impact of delayed acks. This double counts - * this sample since we count it for the next window as well, - * but that's not too awful, since we're taking the min, - * rather than averaging. - */ - vegas_rtt_calc(tp, seq_rtt); - - /* We do the Vegas calculations only if we got enough RTT - * samples that we can be reasonably sure that we got - * at least one RTT sample that wasn't from a delayed ACK. - * If we only had 2 samples total, - * then that means we're getting only 1 ACK per RTT, which - * means they're almost certainly delayed ACKs. - * If we have 3 samples, we should be OK. - */ - - if (tp->vegas.cntRTT <= 2) { - /* We don't have enough RTT samples to do the Vegas - * calculation, so we'll behave like Reno. 
- */ - if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd++; - } else { - u32 rtt, target_cwnd, diff; - - /* We have enough RTT samples, so, using the Vegas - * algorithm, we determine if we should increase or - * decrease cwnd, and by how much. - */ - - /* Pluck out the RTT we are using for the Vegas - * calculations. This is the min RTT seen during the - * last RTT. Taking the min filters out the effects - * of delayed ACKs, at the cost of noticing congestion - * a bit later. - */ - rtt = tp->vegas.minRTT; - - /* Calculate the cwnd we should have, if we weren't - * going too fast. - * - * This is: - * (actual rate in segments) * baseRTT - * We keep it as a fixed point number with - * V_PARAM_SHIFT bits to the right of the binary point. - */ - target_cwnd = ((old_wnd * tp->vegas.baseRTT) - << V_PARAM_SHIFT) / rtt; - - /* Calculate the difference between the window we had, - * and the window we would like to have. This quantity - * is the "Diff" from the Arizona Vegas papers. - * - * Again, this is a fixed point number with - * V_PARAM_SHIFT bits to the right of the binary - * point. - */ - diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; - - if (tp->snd_cwnd < tp->snd_ssthresh) { - /* Slow start. */ - if (diff > sysctl_tcp_vegas_gamma) { - /* Going too fast. Time to slow down - * and switch to congestion avoidance. - */ - tp->snd_ssthresh = 2; - - /* Set cwnd to match the actual rate - * exactly: - * cwnd = (actual rate) * baseRTT - * Then we add 1 because the integer - * truncation robs us of full link - * utilization. - */ - tp->snd_cwnd = min(tp->snd_cwnd, - (target_cwnd >> - V_PARAM_SHIFT)+1); - - } - } else { - /* Congestion avoidance. */ - u32 next_snd_cwnd; - - /* Figure out where we would like cwnd - * to be. - */ - if (diff > sysctl_tcp_vegas_beta) { - /* The old window was too fast, so - * we slow down. - */ - next_snd_cwnd = old_snd_cwnd - 1; - } else if (diff < sysctl_tcp_vegas_alpha) { - /* We don't have enough extra packets - * in the network, so speed up. - */ - next_snd_cwnd = old_snd_cwnd + 1; - } else { - /* Sending just as fast as we - * should be. - */ - next_snd_cwnd = old_snd_cwnd; - } - - /* Adjust cwnd upward or downward, toward the - * desired value. - */ - if (next_snd_cwnd > tp->snd_cwnd) - tp->snd_cwnd++; - else if (next_snd_cwnd < tp->snd_cwnd) - tp->snd_cwnd--; - } - } - - /* Wipe the slate clean for the next RTT. */ - tp->vegas.cntRTT = 0; - tp->vegas.minRTT = 0x7fffffff; - } - - /* The following code is executed for every ack we receive, - * except for conditions checked in should_advance_cwnd() - * before the call to tcp_cong_avoid(). Mainly this means that - * we only execute this code if the ack actually acked some - * data. - */ - - /* If we are in slow start, increase our cwnd in response to this ACK. - * (If we are not in slow start then we are in congestion avoidance, - * and adjust our congestion window only once per RTT. See the code - * above.) - */ - if (tp->snd_cwnd <= tp->snd_ssthresh) - tp->snd_cwnd++; - - /* to keep cwnd from growing without bound */ - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); - - /* Make sure that we are never so timid as to reduce our cwnd below - * 2 MSS. - * - * Going below 2 MSS would risk huge delayed ACKs from our receiver. 
- */ - tp->snd_cwnd = max(tp->snd_cwnd, 2U); - - tp->snd_cwnd_stamp = tcp_time_stamp; -} - -static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) -{ - if (tcp_vegas_enabled(tp)) - vegas_cong_avoid(tp, ack, seq_rtt); - else - reno_cong_avoid(tp); -} - /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ @@ -2415,13 +2024,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, /* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) +static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; __s32 seq_rtt = -1; + struct timeval usnow; + u32 pkts_acked = 0; + + if (seq_usrtt) + do_gettimeofday(&usnow); while ((skb = skb_peek(&sk->sk_write_queue)) && skb != sk->sk_send_head) { @@ -2448,6 +2062,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) */ if (!(scb->flags & TCPCB_FLAG_SYN)) { acked |= FLAG_DATA_ACKED; + ++pkts_acked; } else { acked |= FLAG_SYN_ACKED; tp->retrans_stamp = 0; @@ -2461,6 +2076,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) seq_rtt = -1; } else if (seq_rtt < 0) seq_rtt = now - scb->when; + if (seq_usrtt) + *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 + + (usnow.tv_usec - skb->stamp.tv_usec); + if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); if (sacked & TCPCB_LOST) @@ -2479,8 +2098,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } if (acked&FLAG_ACKED) { - tcp_ack_update_rtt(tp, acked, seq_rtt); + tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); tcp_ack_packets_out(sk, tp); + + if (tp->ca_ops->pkts_acked) + tp->ca_ops->pkts_acked(tp, pkts_acked); } #if FASTRETRANS_DEBUG > 0 @@ -2624,257 +2246,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) tp->frto_counter = (tp->frto_counter + 1) % 3; } -/* - * TCP Westwood+ - */ - -/* - * @init_westwood - * This function initializes fields used in TCP Westwood+. We can't - * get no information about RTTmin at this time so we simply set it to - * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative - * since in this way we're sure it will be updated in a consistent - * way as soon as possible. It will reasonably happen within the first - * RTT period of the connection lifetime. - */ - -static void init_westwood(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.bw_ns_est = 0; - tp->westwood.bw_est = 0; - tp->westwood.accounted = 0; - tp->westwood.cumul_ack = 0; - tp->westwood.rtt_win_sx = tcp_time_stamp; - tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT; - tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT; - tp->westwood.snd_una = tp->snd_una; -} - -/* - * @westwood_do_filter - * Low-pass filter. Implemented using constant coeffients. - */ - -static inline __u32 westwood_do_filter(__u32 a, __u32 b) -{ - return (((7 * a) + b) >> 3); -} - -static void westwood_filter(struct sock *sk, __u32 delta) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.bw_ns_est = - westwood_do_filter(tp->westwood.bw_ns_est, - tp->westwood.bk / delta); - tp->westwood.bw_est = - westwood_do_filter(tp->westwood.bw_est, - tp->westwood.bw_ns_est); -} - -/* - * @westwood_update_rttmin - * It is used to update RTTmin. In this case we MUST NOT use - * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN! 
- */ - -static inline __u32 westwood_update_rttmin(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - __u32 rttmin = tp->westwood.rtt_min; - - if (tp->westwood.rtt != 0 && - (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin)) - rttmin = tp->westwood.rtt; - - return rttmin; -} - -/* - * @westwood_acked - * Evaluate increases for dk. - */ - -static inline __u32 westwood_acked(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - return tp->snd_una - tp->westwood.snd_una; -} - -/* - * @westwood_new_window - * It evaluates if we are receiving data inside the same RTT window as - * when we started. - * Return value: - * It returns 0 if we are still evaluating samples in the same RTT - * window, 1 if the sample has to be considered in the next window. - */ - -static int westwood_new_window(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - __u32 left_bound; - __u32 rtt; - int ret = 0; - - left_bound = tp->westwood.rtt_win_sx; - rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN); - - /* - * A RTT-window has passed. Be careful since if RTT is less than - * 50ms we don't filter but we continue 'building the sample'. - * This minimum limit was choosen since an estimation on small - * time intervals is better to avoid... - * Obvioulsy on a LAN we reasonably will always have - * right_bound = left_bound + WESTWOOD_RTT_MIN - */ - - if ((left_bound + rtt) < tcp_time_stamp) - ret = 1; - - return ret; -} - -/* - * @westwood_update_window - * It updates RTT evaluation window if it is the right moment to do - * it. If so it calls filter for evaluating bandwidth. - */ - -static void __westwood_update_window(struct sock *sk, __u32 now) -{ - struct tcp_sock *tp = tcp_sk(sk); - __u32 delta = now - tp->westwood.rtt_win_sx; - - if (delta) { - if (tp->westwood.rtt) - westwood_filter(sk, delta); - - tp->westwood.bk = 0; - tp->westwood.rtt_win_sx = tcp_time_stamp; - } -} - - -static void westwood_update_window(struct sock *sk, __u32 now) -{ - if (westwood_new_window(sk)) - __westwood_update_window(sk, now); -} - -/* - * @__tcp_westwood_fast_bw - * It is called when we are in fast path. In particular it is called when - * header prediction is successfull. In such case infact update is - * straight forward and doesn't need any particular care. - */ - -static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - westwood_update_window(sk, tcp_time_stamp); - - tp->westwood.bk += westwood_acked(sk); - tp->westwood.snd_una = tp->snd_una; - tp->westwood.rtt_min = westwood_update_rttmin(sk); -} - -static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) -{ - if (tcp_is_westwood(tcp_sk(sk))) - __tcp_westwood_fast_bw(sk, skb); -} - - -/* - * @westwood_dupack_update - * It updates accounted and cumul_ack when receiving a dupack. 
- */ - -static void westwood_dupack_update(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.accounted += tp->mss_cache_std; - tp->westwood.cumul_ack = tp->mss_cache_std; -} - -static inline int westwood_may_change_cumul(struct tcp_sock *tp) -{ - return (tp->westwood.cumul_ack > tp->mss_cache_std); -} - -static inline void westwood_partial_update(struct tcp_sock *tp) -{ - tp->westwood.accounted -= tp->westwood.cumul_ack; - tp->westwood.cumul_ack = tp->mss_cache_std; -} - -static inline void westwood_complete_update(struct tcp_sock *tp) -{ - tp->westwood.cumul_ack -= tp->westwood.accounted; - tp->westwood.accounted = 0; -} - -/* - * @westwood_acked_count - * This function evaluates cumul_ack for evaluating dk in case of - * delayed or partial acks. - */ - -static inline __u32 westwood_acked_count(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.cumul_ack = westwood_acked(sk); - - /* If cumul_ack is 0 this is a dupack since it's not moving - * tp->snd_una. - */ - if (!(tp->westwood.cumul_ack)) - westwood_dupack_update(sk); - - if (westwood_may_change_cumul(tp)) { - /* Partial or delayed ack */ - if (tp->westwood.accounted >= tp->westwood.cumul_ack) - westwood_partial_update(tp); - else - westwood_complete_update(tp); - } - - tp->westwood.snd_una = tp->snd_una; - - return tp->westwood.cumul_ack; -} - - -/* - * @__tcp_westwood_slow_bw - * It is called when something is going wrong..even if there could - * be no problems! Infact a simple delayed packet may trigger a - * dupack. But we need to be careful in such case. - */ - -static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - westwood_update_window(sk, tcp_time_stamp); - - tp->westwood.bk += westwood_acked_count(sk); - tp->westwood.rtt_min = westwood_update_rttmin(sk); -} - -static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) -{ - if (tcp_is_westwood(tcp_sk(sk))) - __tcp_westwood_slow_bw(sk, skb); -} - /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) { @@ -2884,6 +2255,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; s32 seq_rtt; + s32 seq_usrtt = 0; int prior_packets; /* If the ack is newer than sent or older than previous acks @@ -2902,9 +2274,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) */ tcp_update_wl(tp, ack, ack_seq); tp->snd_una = ack; - tcp_westwood_fast_bw(sk, skb); flag |= FLAG_WIN_UPDATE; + tcp_ca_event(tp, CA_EVENT_FAST_ACK); + NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); } else { if (ack_seq != TCP_SKB_CB(skb)->end_seq) @@ -2920,7 +2293,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) flag |= FLAG_ECE; - tcp_westwood_slow_bw(sk,skb); + tcp_ca_event(tp, CA_EVENT_SLOW_ACK); } /* We passed data and got it acked, remove any soft error @@ -2935,22 +2308,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) prior_in_flight = tcp_packets_in_flight(tp); /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, &seq_rtt); + flag |= tcp_clean_rtx_queue(sk, &seq_rtt, + tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); if (tp->frto_counter) tcp_process_frto(sk, prior_snd_una); if (tcp_ack_is_dubious(tp, flag)) { /* Advanve CWND, if state allows this. 
*/ - if ((flag & FLAG_DATA_ACKED) && - (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && - tcp_may_raise_cwnd(tp, flag)) - tcp_cong_avoid(tp, ack, seq_rtt); + if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) + tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); } else { - if ((flag & FLAG_DATA_ACKED) && - (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) - tcp_cong_avoid(tp, ack, seq_rtt); + if ((flag & FLAG_DATA_ACKED)) + tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) @@ -4552,6 +3923,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); + tcp_init_congestion_control(tp); + /* Prevent spurious tcp_cwnd_restart() on first data * packet. */ @@ -4708,9 +4081,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if(tp->af_specific->conn_request(sk, skb) < 0) return 1; - init_westwood(sk); - init_bictcp(tp); - /* Now we have several options: In theory there is * nothing else in the frame. KA9Q has an option to * send data with the syn, BSD accepts data with the @@ -4732,9 +4102,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; case TCP_SYN_SENT: - init_westwood(sk); - init_bictcp(tp); - queued = tcp_rcv_synsent_state_process(sk, skb, th, len); if (queued >= 0) return queued; @@ -4816,7 +4183,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(tp, 0); + tcp_ack_saw_tstamp(tp, 0, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -4828,6 +4195,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); + tcp_init_congestion_control(tp); + /* Prevent spurious tcp_cwnd_restart() on * first data packet. */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2d41d5d6ad19..9122814c13ad 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2048,6 +2048,7 @@ static int tcp_v4_init_sock(struct sock *sk) tp->mss_cache_std = tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; + tp->ca_ops = &tcp_reno; sk->sk_state = TCP_CLOSE; @@ -2070,6 +2071,8 @@ int tcp_v4_destroy_sock(struct sock *sk) tcp_clear_xmit_timers(sk); + tcp_cleanup_congestion_control(tp); + /* Cleanup up the write buffer. 
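The two tcp_ipv4.c hunks above give every socket the built-in fallback (tp->ca_ops = &tcp_reno) and tear module state down in tcp_v4_destroy_sock(). The ops table itself is declared in a header this log does not touch; judging only from the call sites and the ".init = ..." style initializers in the modules added below, it plausibly has the shape sketched here. The field set is evidenced by this series; the declaration order and the list_head are assumptions.

	/* Sketch reconstructed from call sites in this series, not the
	 * verbatim kernel declaration. */
	struct tcp_congestion_ops {
		struct list_head list;		/* registration list (assumed) */

		void (*init)(struct tcp_sock *tp);
		u32  (*ssthresh)(struct tcp_sock *tp);	/* new ssthresh on loss */
		u32  (*min_cwnd)(struct tcp_sock *tp);	/* cwnd floor in recovery */
		void (*cong_avoid)(struct tcp_sock *tp, u32 ack, u32 rtt,
				   u32 in_flight, int good);
		u32  (*undo_cwnd)(struct tcp_sock *tp);	/* revert a false reduction */
		void (*set_state)(struct tcp_sock *tp, u8 new_state);
		void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev);
		void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt); /* usec RTTs, optional */
		void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked);
		void (*get_info)(struct tcp_sock *tp, u32 ext, struct sk_buff *skb);

		struct module *owner;
		const char *name;
	};

Each module keeps its private state in a TCP_CA_PRIV_SIZE scratch area reached through tcp_ca(tp), which is why every module registered below BUG_ONs if its struct would outgrow it.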
*/ sk_stream_writequeue_purge(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b3943e7562f3..f42a284164b7 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -774,6 +774,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->frto_counter = 0; newtp->frto_highmark = 0; + newtp->ca_ops = &tcp_reno; + tcp_set_ca_state(newtp, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); @@ -842,8 +844,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if (newtp->ecn_flags&TCP_ECN_OK) sock_set_flag(newsk, SOCK_NO_LARGESEND); - tcp_ca_init(newtp); - TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); } return newsk; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f17c6577e337..0e17c244875c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) u32 restart_cwnd = tcp_init_cwnd(tp, dst); u32 cwnd = tp->snd_cwnd; - if (tcp_is_vegas(tp)) - tcp_vegas_enable(tp); + tcp_ca_event(tp, CA_EVENT_CWND_RESTART); tp->snd_ssthresh = tcp_current_ssthresh(tp); restart_cwnd = min(restart_cwnd, cwnd); @@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) #define SYSCTL_FLAG_WSCALE 0x2 #define SYSCTL_FLAG_SACK 0x4 + /* If congestion control is doing timestamping */ + if (tp->ca_ops->rtt_sample) + do_gettimeofday(&skb->stamp); + sysctl_flags = 0; if (tcb->flags & TCPCB_FLAG_SYN) { tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; @@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); } - /* - * If the connection is idle and we are restarting, - * then we don't want to do any Vegas calculations - * until we get fresh RTT samples. So when we - * restart, we reset our Vegas state to a clean - * slate. After we get acks for this flight of - * packets, _then_ we can make Vegas calculations - * again. - */ - if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0) - tcp_vegas_enable(tp); + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(tp, CA_EVENT_TX_START); th = (struct tcphdr *) skb_push(skb, tcp_header_size); skb->h.th = th; @@ -521,6 +515,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) * skbs, which it never sent before. --ANK */ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; + buff->stamp = skb->stamp; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { tp->lost_out -= tcp_skb_pcount(skb); @@ -1449,7 +1444,6 @@ static inline void tcp_connect_init(struct sock *sk) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(sk); - tcp_ca_init(tp); tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), @@ -1503,7 +1497,6 @@ int tcp_connect(struct sock *sk) TCP_SKB_CB(buff)->end_seq = tp->write_seq; tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; - tcp_ca_init(tp); /* Send it off. 
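The tcp_output.c hunks above raise generic events where the code used to call Vegas helpers directly. Given the ops table sketched earlier, the dispatcher these call sites imply is a guarded indirect call; a minimal sketch (the real helper may do more filtering):

	static inline void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event)
	{
		if (tp->ca_ops->cwnd_event)
			tp->ca_ops->cwnd_event(tp, event);
	}

Modules that do not care leave .cwnd_event NULL; in this series tcp_westwood hooks CA_EVENT_FAST_ACK and CA_EVENT_SLOW_ACK to feed its bandwidth estimator, and tcp_vegas hooks CA_EVENT_TX_START and CA_EVENT_CWND_RESTART to reset its per-RTT state.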
*/ TCP_SKB_CB(buff)->when = tcp_time_stamp; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2414937f2a83..fce56039b0e9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2025,7 +2025,7 @@ static int tcp_v6_init_sock(struct sock *sk) sk->sk_state = TCP_CLOSE; tp->af_specific = &ipv6_specific; - + tp->ca_ops = &tcp_reno; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); -- cgit From 7c99c909fa69a183c1b80bd64fb9f0d11459aff3 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:20:36 -0700 Subject: [TCP]: Change tcp_diag to use the existing __RTA_PUT() macro. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_diag.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 867acc0f79d8..a4e512036d88 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -43,13 +43,7 @@ struct tcpdiag_entry static struct sock *tcpnl; #define TCPDIAG_PUT(skb, attrtype, attrlen) \ -({ int rtalen = RTA_LENGTH(attrlen); \ - struct rtattr *rta; \ - if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \ - rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \ - rta->rta_type = attrtype; \ - rta->rta_len = rtalen; \ - RTA_DATA(rta); }) + RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, int ext, u32 pid, u32 seq, u16 nlmsg_flags) @@ -167,6 +161,7 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, nlh->nlmsg_len = skb->tail - b; return skb->len; +rtattr_failure: nlmsg_failure: skb_trim(skb, b - skb->data); return -1; -- cgit From 056ede6cface66b400cd3b8e60ed077cc5b85c18 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:21:28 -0700 Subject: [TCP]: Report congestion control algorithm in tcp_diag. Enhancement to the tcp_diag interface used by the iproute2 ss command to report the tcp congestion control being used by a socket. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_diag.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index a4e512036d88..f66945cb158f 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -65,6 +65,11 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (ext & (1<<(TCPDIAG_INFO-1))) info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); + if (ext & (1<<(TCPDIAG_CONG-1))) { + size_t len = strlen(tp->ca_ops->name); + strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1), + tp->ca_ops->name); + } } r->tcpdiag_family = sk->sk_family; r->tcpdiag_state = sk->sk_state; -- cgit From 83803034f4233d810c4adc52008921da060c55d1 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:23:25 -0700 Subject: [TCP]: Add TCP BIC congestion control module. TCP BIC congestion control reworked to use the new congestion control infrastructure. This version is more up to date than the BIC code in 2.6.12; it incorporates enhancements from BICTCP 1.1, to handle low latency links. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv4/Kconfig | 21 ++++ net/ipv4/Makefile | 1 + net/ipv4/tcp_bic.c | 331 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 353 insertions(+) create mode 100644 net/ipv4/tcp_bic.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 567b03b1c349..712ebacacb62 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -433,5 +433,26 @@ config IP_TCPDIAG config IP_TCPDIAG_IPV6 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
+# TCP Reno is builtin (required as fallback)
+menu "TCP congestion control"
+ depends on INET
+
+config TCP_CONG_BIC
+ tristate "Binary Increase Congestion (BIC) control"
+ depends on INET
+ default y
+ ---help---
+ BIC-TCP is a sender-side only change that ensures linear RTT
+ fairness under large windows while offering both scalability and
+ bounded TCP-friendliness. The protocol combines two schemes
+ called additive increase and binary search increase. When the
+ congestion window is large, additive increase with a large
+ increment ensures linear RTT fairness as well as good
+ scalability. Under small congestion windows, binary search
+ increase provides TCP friendliness.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+
+endmenu
+
source "net/ipv4/ipvs/Kconfig" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 89c0b4cb470e..1d1cac5ac06a 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -31,6 +31,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o +obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c new file mode 100644 index 000000000000..ec38d45d6649 --- /dev/null +++ b/net/ipv4/tcp_bic.c @@ -0,0 +1,331 @@
+/*
+ * Binary Increase Congestion control for TCP
+ *
+ * This is from the implementation of BICTCP in
+ * Lisong Xu, Khaled Harfoush, and Injong Rhee.
+ * "Binary Increase Congestion Control for Fast, Long Distance
+ * Networks" in INFOCOM 2004
+ * Available from:
+ * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
+ *
+ * Unless BIC is enabled and congestion window is large
+ * this behaves the same as the original Reno.
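+ *
+ * Equivalently, in one formula: let Wmax = last_max_cwnd, the window at
+ * the last loss, and dist = (Wmax - cwnd) / BICTCP_B. Below Wmax the
+ * per-RTT growth implemented by bictcp_update() is
+ *
+ *	inc per RTT ~= min(dist, max_increment)
+ *
+ * except when dist <= 1 (cwnd within a few segments of Wmax), where
+ * growth is smoothed to BICTCP_B/smooth_part (4/20) of a segment per
+ * RTT; the code expresses this as cnt = cwnd/inc, the number of ACKs
+ * per unit cwnd increase. Above Wmax the same search probes upward,
+ * slow start first, then linear. (A summary derived from the code
+ * below, not text from the paper.)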
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+
+#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
+ * max_cwnd = snd_cwnd * beta
+ */
+#define BICTCP_B 4 /*
+ * In binary search,
+ * go to point (max+min)/N
+ */
+
+static int fast_convergence = 1;
+static int max_increment = 32;
+static int low_window = 14;
+static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
+static int low_utilization_threshold = 153;
+static int low_utilization_period = 2;
+static int initial_ssthresh = 100;
+static int smooth_part = 20;
+
+module_param(fast_convergence, int, 0644);
+MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
+module_param(max_increment, int, 0644);
+MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
+module_param(low_window, int, 0644);
+MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "beta for multiplicative increase");
+module_param(low_utilization_threshold, int, 0644);
+MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode");
+module_param(low_utilization_period, int, 0644);
+MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then go to low utilization mode (seconds)");
+module_param(initial_ssthresh, int, 0644);
+MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
+module_param(smooth_part, int, 0644);
+MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
+
+
+/* BIC TCP Parameters */
+struct bictcp {
+ u32 cnt; /* increase cwnd by 1 after ACKs */
+ u32 last_max_cwnd; /* last maximum snd_cwnd */
+ u32 loss_cwnd; /* congestion window at last loss */
+ u32 last_cwnd; /* the last snd_cwnd */
+ u32 last_time; /* time when updated last_cwnd */
+ u32 delay_min; /* min delay */
+ u32 delay_max; /* max delay */
+ u32 last_delay;
+ u8 low_utilization;/* 0: high; 1: low */
+ u32 low_utilization_start; /* starting time of low utilization detection*/
+ u32 epoch_start; /* beginning of an epoch */
+#define ACK_RATIO_SHIFT 4
+ u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
+};
+
+static inline void bictcp_reset(struct bictcp *ca)
+{
+ ca->cnt = 0;
+ ca->last_max_cwnd = 0;
+ ca->loss_cwnd = 0;
+ ca->last_cwnd = 0;
+ ca->last_time = 0;
+ ca->delay_min = 0;
+ ca->delay_max = 0;
+ ca->last_delay = 0;
+ ca->low_utilization = 0;
+ ca->low_utilization_start = 0;
+ ca->epoch_start = 0;
+ ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
+}
+
+static void bictcp_init(struct tcp_sock *tp)
+{
+ bictcp_reset(tcp_ca(tp));
+ if (initial_ssthresh)
+ tp->snd_ssthresh = initial_ssthresh;
+}
+
+/*
+ * Compute congestion window to use.
+ */ +static inline void bictcp_update(struct bictcp *ca, u32 cwnd) +{ + if (ca->last_cwnd == cwnd && + (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + return; + + ca->last_cwnd = cwnd; + ca->last_time = tcp_time_stamp; + + if (ca->epoch_start == 0) /* record the beginning of an epoch */ + ca->epoch_start = tcp_time_stamp; + + /* start off normal */ + if (cwnd <= low_window) { + ca->cnt = cwnd; + return; + } + + /* binary increase */ + if (cwnd < ca->last_max_cwnd) { + __u32 dist = (ca->last_max_cwnd - cwnd) + / BICTCP_B; + + if (dist > max_increment) + /* linear increase */ + ca->cnt = cwnd / max_increment; + else if (dist <= 1U) + /* binary search increase */ + ca->cnt = (cwnd * smooth_part) / BICTCP_B; + else + /* binary search increase */ + ca->cnt = cwnd / dist; + } else { + /* slow start AMD linear increase */ + if (cwnd < ca->last_max_cwnd + BICTCP_B) + /* slow start */ + ca->cnt = (cwnd * smooth_part) / BICTCP_B; + else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1)) + /* slow start */ + ca->cnt = (cwnd * (BICTCP_B-1)) + / cwnd-ca->last_max_cwnd; + else + /* linear increase */ + ca->cnt = cwnd / max_increment; + } + + /* if in slow start or link utilization is very low */ + if ( ca->loss_cwnd == 0 || + (cwnd > ca->loss_cwnd && ca->low_utilization)) { + if (ca->cnt > 20) /* increase cwnd 5% per RTT */ + ca->cnt = 20; + } + + ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; + if (ca->cnt == 0) /* cannot be zero */ + ca->cnt = 1; +} + + +/* Detect low utilization in congestion avoidance */ +static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) +{ + struct bictcp *ca = tcp_ca(tp); + u32 dist, delay; + + /* No time stamp */ + if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || + /* Discard delay samples right after fast recovery */ + tcp_time_stamp < ca->epoch_start + HZ || + /* this delay samples may not be accurate */ + flag == 0) { + ca->last_delay = 0; + goto notlow; + } + + delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/ + ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + if (delay == 0) /* no previous delay sample */ + goto notlow; + + /* first time call or link delay decreases */ + if (ca->delay_min == 0 || ca->delay_min > delay) { + ca->delay_min = ca->delay_max = delay; + goto notlow; + } + + if (ca->delay_max < delay) + ca->delay_max = delay; + + /* utilization is low, if avg delay < dist*threshold + for checking_period time */ + dist = ca->delay_max - ca->delay_min; + if (dist <= ca->delay_min>>6 || + tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10) + goto notlow; + + if (ca->low_utilization_start == 0) { + ca->low_utilization = 0; + ca->low_utilization_start = tcp_time_stamp; + } else if ((s32)(tcp_time_stamp - ca->low_utilization_start) + > low_utilization_period*HZ) { + ca->low_utilization = 1; + } + + return; + + notlow: + ca->low_utilization = 0; + ca->low_utilization_start = 0; + +} + +static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, + u32 seq_rtt, u32 in_flight, int data_acked) +{ + struct bictcp *ca = tcp_ca(tp); + + bictcp_low_utilization(tp, data_acked); + + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + bictcp_update(ca, tp->snd_cwnd); + + /* In dangerous area, increase slowly. 
+ * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= ca->cnt) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } + +} + +/* + * behave like Reno until low_window is reached, + * then increase congestion window slowly + */ +static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) +{ + struct bictcp *ca = tcp_ca(tp); + + ca->epoch_start = 0; /* end of epoch */ + + /* in case of wrong delay_max*/ + if (ca->delay_min > 0 && ca->delay_max > ca->delay_min) + ca->delay_max = ca->delay_min + + ((ca->delay_max - ca->delay_min)* 90) / 100; + + /* Wmax and fast convergence */ + if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) + ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) + / (2 * BICTCP_BETA_SCALE); + else + ca->last_max_cwnd = tp->snd_cwnd; + + ca->loss_cwnd = tp->snd_cwnd; + + + if (tp->snd_cwnd <= low_window) + return max(tp->snd_cwnd >> 1U, 2U); + else + return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); +} + +static u32 bictcp_undo_cwnd(struct tcp_sock *tp) +{ + struct bictcp *ca = tcp_ca(tp); + + return max(tp->snd_cwnd, ca->last_max_cwnd); +} + +static u32 bictcp_min_cwnd(struct tcp_sock *tp) +{ + return tp->snd_ssthresh; +} + +static void bictcp_state(struct tcp_sock *tp, u8 new_state) +{ + if (new_state == TCP_CA_Loss) + bictcp_reset(tcp_ca(tp)); +} + +/* Track delayed acknowledgement ratio using sliding window + * ratio = (15*ratio + sample) / 16 + */ +static void bictcp_acked(struct tcp_sock *tp, u32 cnt) +{ + if (cnt > 0 && tp->ca_state == TCP_CA_Open) { + struct bictcp *ca = tcp_ca(tp); + cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ca->delayed_ack += cnt; + } +} + + +static struct tcp_congestion_ops bictcp = { + .init = bictcp_init, + .ssthresh = bictcp_recalc_ssthresh, + .cong_avoid = bictcp_cong_avoid, + .set_state = bictcp_state, + .undo_cwnd = bictcp_undo_cwnd, + .min_cwnd = bictcp_min_cwnd, + .pkts_acked = bictcp_acked, + .owner = THIS_MODULE, + .name = "bic", +}; + +static int __init bictcp_register(void) +{ + BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE); + return tcp_register_congestion_control(&bictcp); +} + +static void __exit bictcp_unregister(void) +{ + tcp_unregister_congestion_control(&bictcp); +} + +module_init(bictcp_register); +module_exit(bictcp_unregister); + +MODULE_AUTHOR("Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("BIC TCP"); -- cgit From 8727076289ec55298a05cabddf02b374d13c1624 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:24:09 -0700 Subject: [TCP]: Add TCP Westwood congestion control module. This is the existing 2.6.12 Westwood code moved from tcp_input to the new congestion framework. A lot of the inline functions have been eliminated to try and make it clearer. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 15 +++ net/ipv4/Makefile | 1 + net/ipv4/tcp_westwood.c | 259 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 275 insertions(+) create mode 100644 net/ipv4/tcp_westwood.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 712ebacacb62..adbe855d931a 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -452,6 +452,21 @@ config TCP_CONG_BIC increase provides TCP friendliness. 
See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+config TCP_CONG_WESTWOOD
+ tristate "TCP Westwood+"
+ depends on INET
+ default m
+ ---help---
+ TCP Westwood+ is a sender-side only modification of the TCP Reno
+ protocol stack that optimizes the performance of TCP congestion
+ control. It is based on end-to-end bandwidth estimation to set
+ congestion window and slow start threshold after a congestion
+ episode. Using this estimation, TCP Westwood+ adaptively sets a
+ slow start threshold and a congestion window which takes into
+ account the bandwidth used at the time congestion is experienced.
+ TCP Westwood+ significantly increases fairness wrt TCP Reno in
+ wired networks and throughput over wireless links.
+
endmenu source "net/ipv4/ipvs/Kconfig" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 1d1cac5ac06a..dedfbe62a104 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o +obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c new file mode 100644 index 000000000000..ef827242c940 --- /dev/null +++ b/net/ipv4/tcp_westwood.c @@ -0,0 +1,259 @@
+/*
+ * TCP Westwood+
+ *
+ * Angelo Dell'Aera: TCP Westwood+ support
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/tcp_diag.h>
+#include <net/tcp.h>
+
+/* TCP Westwood structure */
+struct westwood {
+ u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */
+ u32 bw_est; /* bandwidth estimate */
+ u32 rtt_win_sx; /* here starts a new evaluation... */
+ u32 bk;
+ u32 snd_una; /* used for evaluating the number of acked bytes */
+ u32 cumul_ack;
+ u32 accounted;
+ u32 rtt;
+ u32 rtt_min; /* minimum observed RTT */
+};
+
+
+/* TCP Westwood functions and constants */
+#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
+#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */
+
+/*
+ * @tcp_westwood_create
+ * This function initializes fields used in TCP Westwood+.
+ * It is called after the initial SYN, so the sequence numbers
+ * are correct, but for new passive connections we have no
+ * information about RTTmin at this time so we simply set it to
+ * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative
+ * since in this way we're sure it will be updated in a consistent
+ * way as soon as possible. It will reasonably happen within the first
+ * RTT period of the connection lifetime.
+ */
+static void tcp_westwood_init(struct tcp_sock *tp)
+{
+ struct westwood *w = tcp_ca(tp);
+
+ w->bk = 0;
+ w->bw_ns_est = 0;
+ w->bw_est = 0;
+ w->accounted = 0;
+ w->cumul_ack = 0;
+ w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
+ w->rtt_win_sx = tcp_time_stamp;
+ w->snd_una = tp->snd_una;
+}
+
+/*
+ * @westwood_do_filter
+ * Low-pass filter. Implemented using constant coefficients.
+ */
+static inline u32 westwood_do_filter(u32 a, u32 b)
+{
+ return (((7 * a) + b) >> 3);
+}
+
+static inline void westwood_filter(struct westwood *w, u32 delta)
+{
+ w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
+ w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
+}
+
+/*
+ * @westwood_pkts_acked
+ * Called after processing a group of packets,
+ * but all Westwood needs is the last sample of srtt.
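+ *
+ * In symbols, what the two filter helpers above implement once per
+ * RTT window: the raw bandwidth sample is bk/delta (bytes newly
+ * acked over the window duration), smoothed twice by the 7/8
+ * low-pass:
+ *
+ *	bw_ns_est <- (7*bw_ns_est + bk/delta) / 8
+ *	bw_est    <- (7*bw_est   + bw_ns_est) / 8
+ *
+ * After congestion, ssthresh (and on CA_EVENT_COMPLETE_CWR also
+ * cwnd) is set to bw_est * rtt_min converted to segments; that
+ * product is westwood_bw_rttmin() further down.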
+ */ +static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) +{ + struct westwood *w = tcp_ca(tp); + if (cnt > 0) + w->rtt = tp->srtt >> 3; +} + +/* + * @westwood_update_window + * It updates RTT evaluation window if it is the right moment to do + * it. If so it calls filter for evaluating bandwidth. + */ +static void westwood_update_window(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + s32 delta = tcp_time_stamp - w->rtt_win_sx; + + /* + * See if a RTT-window has passed. + * Be careful since if RTT is less than + * 50ms we don't filter but we continue 'building the sample'. + * This minimum limit was chosen since an estimation on small + * time intervals is better to avoid... + * Obviously on a LAN we reasonably will always have + * right_bound = left_bound + WESTWOOD_RTT_MIN + */ + if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) { + westwood_filter(w, delta); + + w->bk = 0; + w->rtt_win_sx = tcp_time_stamp; + } +} + +/* + * @westwood_fast_bw + * It is called when we are in fast path. In particular it is called when + * header prediction is successful. In such case in fact update is + * straight forward and doesn't need any particular care. + */ +static inline void westwood_fast_bw(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + + westwood_update_window(tp); + + w->bk += tp->snd_una - w->snd_una; + w->snd_una = tp->snd_una; + w->rtt_min = min(w->rtt, w->rtt_min); +} + +/* + * @westwood_acked_count + * This function evaluates cumul_ack for evaluating bk in case of + * delayed or partial acks. + */ +static inline u32 westwood_acked_count(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + + w->cumul_ack = tp->snd_una - w->snd_una; + + /* If cumul_ack is 0 this is a dupack since it's not moving + * tp->snd_una. + */ + if (!w->cumul_ack) { + w->accounted += tp->mss_cache; + w->cumul_ack = tp->mss_cache; + } + + if (w->cumul_ack > tp->mss_cache) { + /* Partial or delayed ack */ + if (w->accounted >= w->cumul_ack) { + w->accounted -= w->cumul_ack; + w->cumul_ack = tp->mss_cache; + } else { + w->cumul_ack -= w->accounted; + w->accounted = 0; + } + } + + w->snd_una = tp->snd_una; + + return w->cumul_ack; +} + +static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); +} + +/* + * TCP Westwood + * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it + * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 + * so avoids ever returning 0. + */ +static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp) +{ + return westwood_bw_rttmin(tp); +} + +static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) +{ + struct westwood *w = tcp_ca(tp); + + switch(event) { + case CA_EVENT_FAST_ACK: + westwood_fast_bw(tp); + break; + + case CA_EVENT_COMPLETE_CWR: + tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp); + break; + + case CA_EVENT_FRTO: + tp->snd_ssthresh = westwood_bw_rttmin(tp); + break; + + case CA_EVENT_SLOW_ACK: + westwood_update_window(tp); + w->bk += westwood_acked_count(tp); + w->rtt_min = min(w->rtt, w->rtt_min); + break; + + default: + /* don't care */ + break; + } +} + + +/* Extract info for Tcp socket info provided via netlink. 
*/ +static void tcp_westwood_info(struct tcp_sock *tp, u32 ext, + struct sk_buff *skb) +{ + const struct westwood *ca = tcp_ca(tp); + if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { + struct rtattr *rta; + struct tcpvegas_info *info; + + rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info)); + info = RTA_DATA(rta); + info->tcpv_enabled = 1; + info->tcpv_rttcnt = 0; + info->tcpv_rtt = jiffies_to_usecs(ca->rtt); + info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min); + rtattr_failure: ; + } +} + + +static struct tcp_congestion_ops tcp_westwood = { + .init = tcp_westwood_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_westwood_cwnd_min, + .cwnd_event = tcp_westwood_event, + .get_info = tcp_westwood_info, + .pkts_acked = tcp_westwood_pkts_acked, + + .owner = THIS_MODULE, + .name = "westwood" +}; + +static int __init tcp_westwood_register(void) +{ + BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_westwood); +} + +static void __exit tcp_westwood_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_westwood); +} + +module_init(tcp_westwood_register); +module_exit(tcp_westwood_unregister); + +MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Westwood+"); -- cgit From a628d29b56d3f420bf3ff1d7543a9caf3ce3b994 Mon Sep 17 00:00:00 2001 From: John Heffner Date: Thu, 23 Jun 2005 12:24:58 -0700 Subject: [TCP]: Add High Speed TCP congestion control module. Sally Floyd's high speed TCP congestion control. This is useful for comparison and research. Signed-off-by: John Heffner Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 11 +++ net/ipv4/Makefile | 1 + net/ipv4/tcp_highspeed.c | 181 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 193 insertions(+) create mode 100644 net/ipv4/tcp_highspeed.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index adbe855d931a..910e25b65756 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -467,6 +467,17 @@ config TCP_CONG_WESTWOOD TCP Westwood+ significantly increases fairness wrt TCP Reno in wired networks and throughput over wireless links. +config TCP_CONG_HSTCP + tristate "High Speed TCP" + depends on INET && EXPERIMENTAL + default n + ---help--- + Sally Floyd's High Speed TCP (RFC 3649) congestion control. + A modification to TCP's congestion control mechanism for use + with large congestion windows. A table indicates how much to + increase the congestion window by when an ACK is received. 
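The table the help text mentions drives a window-scaled AIMD. In congestion avoidance each ACK applies

\[ w \leftarrow w + \frac{a(w)}{w}, \]

and each loss event applies

\[ w \leftarrow (1 - b(w))\,w, \]

with both coefficients looked up by current window size. In the module below, b(w) is stored as the fixed-point factor md with b(w) = md/256, so the first row { 38, 128 } reproduces Reno's halving (128/256 = 0.50) for windows up to 38 packets, while a(w) grows implicitly as the lookup index ai steps forward with the window; hstcp_cong_avoid() spreads the increase across an RTT by accumulating ai into snd_cwnd_cnt.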
For more detail see http://www.icir.org/floyd/hstcp.html
+
endmenu source "net/ipv4/ipvs/Kconfig" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index dedfbe62a104..ddf5d7785bf4 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o +obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c new file mode 100644 index 000000000000..36c51f8136bf --- /dev/null +++ b/net/ipv4/tcp_highspeed.c @@ -0,0 +1,181 @@
+/*
+ * Sally Floyd's High Speed TCP (RFC 3649) congestion control
+ *
+ * See http://www.icir.org/floyd/hstcp.html
+ *
+ * John Heffner
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+
+/* From the AIMD tables in RFC 3649 appendix B,
+ * with fixed-point MD scaled <<8.
+ */
+static const struct hstcp_aimd_val {
+ unsigned int cwnd;
+ unsigned int md;
+} hstcp_aimd_vals[] = {
+ { 38, 128, /* 0.50 */ },
+ { 118, 112, /* 0.44 */ },
+ { 221, 104, /* 0.41 */ },
+ { 347, 98, /* 0.38 */ },
+ { 495, 93, /* 0.37 */ },
+ { 663, 89, /* 0.35 */ },
+ { 851, 86, /* 0.34 */ },
+ { 1058, 83, /* 0.33 */ },
+ { 1284, 81, /* 0.32 */ },
+ { 1529, 78, /* 0.31 */ },
+ { 1793, 76, /* 0.30 */ },
+ { 2076, 74, /* 0.29 */ },
+ { 2378, 72, /* 0.28 */ },
+ { 2699, 71, /* 0.28 */ },
+ { 3039, 69, /* 0.27 */ },
+ { 3399, 68, /* 0.27 */ },
+ { 3778, 66, /* 0.26 */ },
+ { 4177, 65, /* 0.26 */ },
+ { 4596, 64, /* 0.25 */ },
+ { 5036, 62, /* 0.25 */ },
+ { 5497, 61, /* 0.24 */ },
+ { 5979, 60, /* 0.24 */ },
+ { 6483, 59, /* 0.23 */ },
+ { 7009, 58, /* 0.23 */ },
+ { 7558, 57, /* 0.22 */ },
+ { 8130, 56, /* 0.22 */ },
+ { 8726, 55, /* 0.22 */ },
+ { 9346, 54, /* 0.21 */ },
+ { 9991, 53, /* 0.21 */ },
+ { 10661, 52, /* 0.21 */ },
+ { 11358, 52, /* 0.20 */ },
+ { 12082, 51, /* 0.20 */ },
+ { 12834, 50, /* 0.20 */ },
+ { 13614, 49, /* 0.19 */ },
+ { 14424, 48, /* 0.19 */ },
+ { 15265, 48, /* 0.19 */ },
+ { 16137, 47, /* 0.19 */ },
+ { 17042, 46, /* 0.18 */ },
+ { 17981, 45, /* 0.18 */ },
+ { 18955, 45, /* 0.18 */ },
+ { 19965, 44, /* 0.17 */ },
+ { 21013, 43, /* 0.17 */ },
+ { 22101, 43, /* 0.17 */ },
+ { 23230, 42, /* 0.17 */ },
+ { 24402, 41, /* 0.16 */ },
+ { 25618, 41, /* 0.16 */ },
+ { 26881, 40, /* 0.16 */ },
+ { 28193, 39, /* 0.16 */ },
+ { 29557, 39, /* 0.15 */ },
+ { 30975, 38, /* 0.15 */ },
+ { 32450, 38, /* 0.15 */ },
+ { 33986, 37, /* 0.15 */ },
+ { 35586, 36, /* 0.14 */ },
+ { 37253, 36, /* 0.14 */ },
+ { 38992, 35, /* 0.14 */ },
+ { 40808, 35, /* 0.14 */ },
+ { 42707, 34, /* 0.13 */ },
+ { 44694, 33, /* 0.13 */ },
+ { 46776, 33, /* 0.13 */ },
+ { 48961, 32, /* 0.13 */ },
+ { 51258, 32, /* 0.13 */ },
+ { 53677, 31, /* 0.12 */ },
+ { 56230, 30, /* 0.12 */ },
+ { 58932, 30, /* 0.12 */ },
+ { 61799, 29, /* 0.12 */ },
+ { 64851, 28, /* 0.11 */ },
+ { 68113, 28, /* 0.11 */ },
+ { 71617, 27, /* 0.11 */ },
+ { 75401, 26, /* 0.10 */ },
+ { 79517, 26, /* 0.10 */ },
+ { 84035, 25, /* 0.10 */ },
+ { 89053, 24, /* 0.10 */ },
+};
+
+#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals)
+
+struct hstcp {
+ u32 ai;
+};
+
+static void hstcp_init(struct tcp_sock *tp)
+{
+ struct hstcp *ca = tcp_ca(tp);
+
+ ca->ai = 0;
+
+ /* Ensure the MD arithmetic works. This is somewhat pedantic,
+ * since I don't think we will see a cwnd this large.
:) */ + tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); +} + +static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt, + u32 in_flight, int good) +{ + struct hstcp *ca = tcp_ca(tp); + + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + /* Update AIMD parameters */ + if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { + while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && + ca->ai < HSTCP_AIMD_MAX) + ca->ai++; + } else if (tp->snd_cwnd < hstcp_aimd_vals[ca->ai].cwnd) { + while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && + ca->ai > 0) + ca->ai--; + } + + /* Do additive increase */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) { + tp->snd_cwnd_cnt += ca->ai; + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt -= tp->snd_cwnd; + } + } + } +} + +static u32 hstcp_ssthresh(struct tcp_sock *tp) +{ + struct hstcp *ca = tcp_ca(tp); + + /* Do multiplicative decrease */ + return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); +} + + +static struct tcp_congestion_ops tcp_highspeed = { + .init = hstcp_init, + .ssthresh = hstcp_ssthresh, + .cong_avoid = hstcp_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + + .owner = THIS_MODULE, + .name = "highspeed" +}; + +static int __init hstcp_register(void) +{ + BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_highspeed); +} + +static void __exit hstcp_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_highspeed); +} + +module_init(hstcp_register); +module_exit(hstcp_unregister); + +MODULE_AUTHOR("John Heffner"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("High Speed TCP"); -- cgit From 835b3f0c0d7e1f716c45ec576662eac7a68b8548 Mon Sep 17 00:00:00 2001 From: Daniele Lacamera <(root at danielinux.net)net> Date: Thu, 23 Jun 2005 12:26:34 -0700 Subject: [TCP]: Add TCP Hybla congestion control module. TCP Hybla congestion avoidance. - "In heterogeneous networks, TCP connections that incorporate a terrestrial or satellite radio link are greatly disadvantaged with respect to entirely wired connections, because of their longer round trip times (RTTs). To cope with this problem, a new TCP proposal, the TCP Hybla, is presented and discussed in the paper[1]. It stems from an analytical evaluation of the congestion window dynamics in the TCP standard versions (Tahoe, Reno, NewReno), which suggests the necessary modifications to remove the performance dependence on RTT.[...]"[1] [1]: Carlo Caini, Rosario Firrincieli, "TCP Hybla: a TCP enhancement for heterogeneous networks", International Journal of Satellite Communications and Networking Volume 22, Issue 5 , Pages 547 - 566. September 2004. Signed-off-by: Daniele Lacamera (root at danielinux.net)net Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 10 +++ net/ipv4/Makefile | 1 + net/ipv4/tcp_hybla.c | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 198 insertions(+) create mode 100644 net/ipv4/tcp_hybla.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 910e25b65756..516ffe842816 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -478,6 +478,16 @@ config TCP_CONG_HSTCP increase the congestion window by when an ACK is received. 
For more detail see http://www.icir.org/floyd/hstcp.html
+config TCP_CONG_HYBLA
+ tristate "TCP-Hybla congestion control algorithm"
+ depends on INET && EXPERIMENTAL
+ default n
+ ---help---
+ TCP-Hybla is a sender-side only change that eliminates penalization of
+ long-RTT, large-bandwidth connections, like when satellite legs are
+ involved, especially when sharing a common bottleneck with normal
+ terrestrial connections.
+
endmenu source "net/ipv4/ipvs/Kconfig" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ddf5d7785bf4..ede7a5d617ff 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o +obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c new file mode 100644 index 000000000000..13a66342c304 --- /dev/null +++ b/net/ipv4/tcp_hybla.c @@ -0,0 +1,187 @@
+/*
+ * TCP HYBLA
+ *
+ * TCP-HYBLA Congestion control algorithm, based on:
+ * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement
+ * for Heterogeneous Networks",
+ * International Journal on satellite Communications,
+ * September 2004
+ * Daniele Lacamera
+ * root at danielinux.net
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* Tcp Hybla structure. */
+struct hybla {
+ u8 hybla_en;
+ u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
+ u32 rho; /* Rho parameter, integer part */
+ u32 rho2; /* Rho * Rho, integer part */
+ u32 rho_3ls; /* Rho parameter, <<3 */
+ u32 rho2_7ls; /* Rho^2, <<7 */
+ u32 minrtt; /* Minimum smoothed round trip time value seen */
+};
+
+/* Hybla reference round trip time (default= 1/40 sec = 25 ms),
+ expressed in jiffies */
+static int rtt0 = 25;
+module_param(rtt0, int, 0644);
+MODULE_PARM_DESC(rtt0, "reference round trip time (ms)");
+
+
+/* This is called to refresh values for hybla parameters */
+static inline void hybla_recalc_param (struct tcp_sock *tp)
+{
+ struct hybla *ca = tcp_ca(tp);
+
+ ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8);
+ ca->rho = ca->rho_3ls >> 3;
+ ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
+ ca->rho2 = ca->rho2_7ls >>7;
+}
+
+static void hybla_init(struct tcp_sock *tp)
+{
+ struct hybla *ca = tcp_ca(tp);
+
+ ca->rho = 0;
+ ca->rho2 = 0;
+ ca->rho_3ls = 0;
+ ca->rho2_7ls = 0;
+ ca->snd_cwnd_cents = 0;
+ ca->hybla_en = 1;
+ tp->snd_cwnd = 2;
+ tp->snd_cwnd_clamp = 65535;
+
+ /* 1st Rho measurement based on initial srtt */
+ hybla_recalc_param(tp);
+
+ /* set minimum rtt as this is the 1st ever seen */
+ ca->minrtt = tp->srtt;
+ tp->snd_cwnd = ca->rho;
+}
+
+static void hybla_state(struct tcp_sock *tp, u8 ca_state)
+{
+ struct hybla *ca = tcp_ca(tp);
+
+ ca->hybla_en = (ca_state == TCP_CA_Open);
+}
+
+static inline u32 hybla_fraction(u32 odds)
+{
+ static const u32 fractions[] = {
+ 128, 139, 152, 165, 181, 197, 215, 234,
+ };
+
+ return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128;
+}
+
+/* TCP Hybla main routine.
+ * This is the algorithm behavior:
+ * o Recalc Hybla parameters if min_rtt has changed
+ * o Give cwnd a new value based on the model proposed
+ * o remember increments <1
+ */
+static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+ u32 in_flight, int flag)
+{
+ struct hybla *ca = tcp_ca(tp);
+ u32 increment, odd, rho_fractions;
+ int is_slowstart = 0;
+
+ /* Recalculate rho only if this srtt is the lowest */
+ if (tp->srtt < ca->minrtt){
+ hybla_recalc_param(tp);
+ ca->minrtt = tp->srtt;
+ }
+
+ if (!ca->hybla_en)
+ return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag);
+
+ if (in_flight < tp->snd_cwnd)
+ return;
+
+ if (ca->rho == 0)
+ hybla_recalc_param(tp);
+
+ rho_fractions = ca->rho_3ls - (ca->rho << 3);
+
+ if (tp->snd_cwnd < tp->snd_ssthresh) {
+ /*
+ * slow start
+ * INC = 2^RHO - 1
+ * This is done by splitting the rho parameter
+ * into 2 parts: an integer part and a fraction part.
+ * Increment<<7 is estimated by doing:
+ * [2^(int+fract)]<<7
+ * that is equal to:
+ * (2^int) * [(2^fract) <<7]
+ * 2^int is computed directly as 1<<int.
+ */
+ is_slowstart = 1;
+ increment = ((1 << ca->rho) * hybla_fraction(rho_fractions))
+ - 128;
+ } else {
+ /*
+ * congestion avoidance
+ * INC = RHO^2 / W
+ * as long as increment is estimated as (rho<<7)/window
+ * it already is <<7 and we can easily count its fractions.
+ */
+ increment = ca->rho2_7ls / tp->snd_cwnd;
+ if (increment < 128)
+ tp->snd_cwnd_cnt++;
+ }
+
+ odd = increment % 128;
+ tp->snd_cwnd += increment >> 7;
+ ca->snd_cwnd_cents += odd;
+
+ /* check when fractions go >= 128 and increase cwnd by 1. */
+ while(ca->snd_cwnd_cents >= 128) {
+ tp->snd_cwnd++;
+ ca->snd_cwnd_cents -= 128;
+ tp->snd_cwnd_cnt = 0;
+ }
+
+ /* clamp down slowstart cwnd to ssthresh value. */
+ if (is_slowstart)
+ tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+
+ tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+}
+
+static struct tcp_congestion_ops tcp_hybla = {
+ .init = hybla_init,
+ .ssthresh = tcp_reno_ssthresh,
+ .min_cwnd = tcp_reno_min_cwnd,
+ .cong_avoid = hybla_cong_avoid,
+ .set_state = hybla_state,
+
+ .owner = THIS_MODULE,
+ .name = "hybla"
+};
+
+static int __init hybla_register(void)
+{
+ BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_hybla);
+}
+
+static void __exit hybla_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_hybla);
+}
+
+module_init(hybla_register);
+module_exit(hybla_unregister);
+
+MODULE_AUTHOR("Daniele Lacamera");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Hybla");
-- cgit From b87d8561d8667d221b728ccdcb18eb95b16a687b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:27:19 -0700 Subject: [TCP]: Add TCP Vegas congestion control module. TCP Vegas code modified for the new TCP infrastructure. Vegas now uses microsecond resolution timestamps for better estimation of performance over higher speed links. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 11 ++ net/ipv4/Makefile | 1 + net/ipv4/tcp_vegas.c | 411 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 423 insertions(+) create mode 100644 net/ipv4/tcp_vegas.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 516ffe842816..6c105b60cc00 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -488,6 +488,17 @@ config TCP_CONG_HYBLA involved, especially when sharing a common bottleneck with normal terrestrial connections.
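Concretely, the compensation this help text describes is driven by \( \rho = \max(\mathrm{RTT}/\mathrm{RTT}_0, 1) \), with RTT0 defaulting to 25 ms (the rtt0 parameter of the module above). Matching the INC comments in hybla_cong_avoid(), the per-ACK window growth is

\[ \Delta w = \begin{cases} 2^{\rho} - 1 & \text{in slow start} \\ \rho^{2}/w & \text{in congestion avoidance,} \end{cases} \]

kept in <<7 fixed point (snd_cwnd_cents) so that sub-unit increments are not lost. On a terrestrial path with RTT <= RTT0, rho is clamped to 1 and both branches reduce exactly to Reno.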
+config TCP_CONG_VEGAS
+ tristate "TCP Vegas"
+ depends on INET && EXPERIMENTAL
+ default n
+ ---help---
+ TCP Vegas is a sender-side only change to TCP that anticipates
+ the onset of congestion by estimating the bandwidth. TCP Vegas
+ adjusts the sending rate by modifying the congestion
+ window. TCP Vegas should provide less packet loss, but it is
+ not as aggressive as TCP Reno.
+
endmenu source "net/ipv4/ipvs/Kconfig" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ede7a5d617ff..a801a97bd011 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o +obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c new file mode 100644 index 000000000000..9bd443db5193 --- /dev/null +++ b/net/ipv4/tcp_vegas.c @@ -0,0 +1,411 @@
+/*
+ * TCP Vegas congestion control
+ *
+ * This is based on the congestion detection/avoidance scheme described in
+ * Lawrence S. Brakmo and Larry L. Peterson.
+ * "TCP Vegas: End to end congestion avoidance on a global internet."
+ * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
+ * October 1995. Available from:
+ * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
+ *
+ * See http://www.cs.arizona.edu/xkernel/ for their implementation.
+ * The main aspects that distinguish this implementation from the
+ * Arizona Vegas implementation are:
+ * o We do not change the loss detection or recovery mechanisms of
+ * Linux in any way. Linux already recovers from losses quite well,
+ * using fine-grained timers, NewReno, and FACK.
+ * o To avoid the performance penalty imposed by increasing cwnd
+ * only every-other RTT during slow start, we increase during
+ * every RTT during slow start, just like Reno.
+ * o Largely to allow continuous cwnd growth during slow start,
+ * we use the rate at which ACKs come back as the "actual"
+ * rate, rather than the rate at which data is sent.
+ * o To speed convergence to the right rate, we set the cwnd
+ * to achieve the right ("actual") rate when we exit slow start.
+ * o To filter out the noise caused by delayed ACKs, we use the
+ * minimum RTT sample observed during the last RTT to calculate
+ * the actual rate.
+ * o When the sender re-starts from idle, it waits until it has
+ * received ACKs for an entire flight of new data before making
+ * a cwnd adjustment decision. The original Vegas implementation
+ * assumed senders never went idle.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/tcp_diag.h>
+
+#include <net/tcp.h>
+
+/* Default values of the Vegas variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+static int alpha = 1<<V_PARAM_SHIFT;
+static int beta = 3<<V_PARAM_SHIFT;
+static int gamma = 1<<V_PARAM_SHIFT;
+
+module_param(alpha, int, 0644);
+MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)");
+module_param(gamma, int, 0644);
+MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
+
+/* Vegas variables */
+struct vegas {
+ u32 beg_snd_nxt; /* right edge during last RTT */
+ u32 beg_snd_una; /* left edge during last RTT */
+ u32 beg_snd_cwnd; /* saves the size of the cwnd */
+ u8 doing_vegas_now;/* if true, do vegas for this RTT */
+ u16 cntRTT; /* # of RTTs measured within last RTT */
+ u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
+ u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
+};
+
+/* There are several situations when we must "re-start" Vegas:
+ *
+ * o when a connection is established
+ * o after an RTO
+ * o after fast recovery
+ * o when we send a packet and there is no outstanding
+ * unacknowledged data (restarting an idle connection)
+ *
+ * In these circumstances we cannot do a Vegas calculation at the
+ * end of the first RTT, because any calculation we do is using
+ * stale info -- both the saved cwnd and congestion feedback are
+ * stale.
+ *
+ * Instead we must wait until the completion of an RTT during
+ * which we actually receive ACKs.
+ */
+static inline void vegas_enable(struct tcp_sock *tp)
+{
+ struct vegas *vegas = tcp_ca(tp);
+
+ /* Begin taking Vegas samples next time we send something. */
+ vegas->doing_vegas_now = 1;
+
+ /* Set the beginning of the next send window. */
+ vegas->beg_snd_nxt = tp->snd_nxt;
+
+ vegas->cntRTT = 0;
+ vegas->minRTT = 0x7fffffff;
+}
+
+/* Stop taking Vegas samples for now. */
+static inline void vegas_disable(struct tcp_sock *tp)
+{
+ struct vegas *vegas = tcp_ca(tp);
+
+ vegas->doing_vegas_now = 0;
+}
+
+static void tcp_vegas_init(struct tcp_sock *tp)
+{
+ struct vegas *vegas = tcp_ca(tp);
+
+ vegas->baseRTT = 0x7fffffff;
+ vegas_enable(tp);
+}
+
+/* Do RTT sampling needed for Vegas.
+ * Basically we: + * o min-filter RTT samples from within an RTT to get the current + * propagation delay + queuing delay (we are min-filtering to try to + * avoid the effects of delayed ACKs) + * o min-filter RTT samples from a much longer window (forever for now) + * to find the propagation delay (baseRTT) + */ +static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt) +{ + struct vegas *vegas = tcp_ca(tp); + u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ + + /* Filter to find propagation delay: */ + if (vrtt < vegas->baseRTT) + vegas->baseRTT = vrtt; + + /* Find the min RTT during the last RTT to find + * the current prop. delay + queuing delay: + */ + vegas->minRTT = min(vegas->minRTT, vrtt); + vegas->cntRTT++; +} + +static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state) +{ + + if (ca_state == TCP_CA_Open) + vegas_enable(tp); + else + vegas_disable(tp); +} + +/* + * If the connection is idle and we are restarting, + * then we don't want to do any Vegas calculations + * until we get fresh RTT samples. So when we + * restart, we reset our Vegas state to a clean + * slate. After we get acks for this flight of + * packets, _then_ we can make Vegas calculations + * again. + */ +static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event) +{ + if (event == CA_EVENT_CWND_RESTART || + event == CA_EVENT_TX_START) + tcp_vegas_init(tp); +} + +static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) +{ + struct vegas *vegas = tcp_ca(tp); + + if (!vegas->doing_vegas_now) + return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag); + + /* The key players are v_beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) + * + * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up nicely with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. + * So we keep track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, vegas->beg_snd_nxt)) { + /* Do the Vegas once-per-RTT cwnd adjustment. */ + u32 old_wnd, old_snd_cwnd; + + + /* Here old_wnd is essentially the window of data that was + * sent during the previous RTT, and has all + * been acknowledged in the course of the RTT that ended + * with the ACK we just received. Likewise, old_snd_cwnd + * is the cwnd during the previous RTT. + */ + old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) / + tp->mss_cache; + old_snd_cwnd = vegas->beg_snd_cwnd; + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + vegas->beg_snd_una = vegas->beg_snd_nxt; + vegas->beg_snd_nxt = tp->snd_nxt; + vegas->beg_snd_cwnd = tp->snd_cwnd; + + /* Take into account the current RTT sample too, to + * decrease the impact of delayed acks. 
This double counts + * this sample since we count it for the next window as well, + * but that's not too awful, since we're taking the min, + * rather than averaging. + */ + tcp_vegas_rtt_calc(tp, seq_rtt*1000); + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (vegas->cntRTT <= 2) { + /* We don't have enough RTT samples to do the Vegas + * calculation, so we'll behave like Reno. + */ + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd++; + } else { + u32 rtt, target_cwnd, diff; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = vegas->minRTT; + + /* Calculate the cwnd we should have, if we weren't + * going too fast. + * + * This is: + * (actual rate in segments) * baseRTT + * We keep it as a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary point. + */ + target_cwnd = ((old_wnd * vegas->baseRTT) + << V_PARAM_SHIFT) / rtt; + + /* Calculate the difference between the window we had, + * and the window we would like to have. This quantity + * is the "Diff" from the Arizona Vegas papers. + * + * Again, this is a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary + * point. + */ + diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; + + if (tp->snd_cwnd < tp->snd_ssthresh) { + /* Slow start. */ + if (diff > gamma) { + /* Going too fast. Time to slow down + * and switch to congestion avoidance. + */ + tp->snd_ssthresh = 2; + + /* Set cwnd to match the actual rate + * exactly: + * cwnd = (actual rate) * baseRTT + * Then we add 1 because the integer + * truncation robs us of full link + * utilization. + */ + tp->snd_cwnd = min(tp->snd_cwnd, + (target_cwnd >> + V_PARAM_SHIFT)+1); + + } + } else { + /* Congestion avoidance. */ + u32 next_snd_cwnd; + + /* Figure out where we would like cwnd + * to be. + */ + if (diff > beta) { + /* The old window was too fast, so + * we slow down. + */ + next_snd_cwnd = old_snd_cwnd - 1; + } else if (diff < alpha) { + /* We don't have enough extra packets + * in the network, so speed up. + */ + next_snd_cwnd = old_snd_cwnd + 1; + } else { + /* Sending just as fast as we + * should be. + */ + next_snd_cwnd = old_snd_cwnd; + } + + /* Adjust cwnd upward or downward, toward the + * desired value. + */ + if (next_snd_cwnd > tp->snd_cwnd) + tp->snd_cwnd++; + else if (next_snd_cwnd < tp->snd_cwnd) + tp->snd_cwnd--; + } + } + + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; + } + + /* The following code is executed for every ack we receive, + * except for conditions checked in should_advance_cwnd() + * before the call to tcp_cong_avoid(). Mainly this means that + * we only execute this code if the ack actually acked some + * data. + */ + + /* If we are in slow start, increase our cwnd in response to this ACK. 
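+ * (While cwnd is still below ssthresh this adds one segment per
+ * ACK, so the window roughly doubles each RTT, matching Reno's
+ * slow start as promised in the header comment above.)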
+ * (If we are not in slow start then we are in congestion avoidance,
+ * and adjust our congestion window only once per RTT. See the code
+ * above.)
+ */
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tp->snd_cwnd++;
+
+	/* to keep cwnd from growing without bound */
+	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+
+	/* Make sure that we are never so timid as to reduce our cwnd below
+	 * 2 MSS.
+	 *
+	 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
+	 */
+	tp->snd_cwnd = max(tp->snd_cwnd, 2U);
+}
+
+/* Extract info for Tcp socket info provided via netlink. */
+static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext,
+			       struct sk_buff *skb)
+{
+	const struct vegas *ca = tcp_ca(tp);
+	if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
+		struct tcpvegas_info *info;
+
+		info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
+					  sizeof(*info)));
+
+		info->tcpv_enabled = ca->doing_vegas_now;
+		info->tcpv_rttcnt = ca->cntRTT;
+		info->tcpv_rtt = ca->baseRTT;
+		info->tcpv_minrtt = ca->minRTT;
+	rtattr_failure:	;
+	}
+}
+
+static struct tcp_congestion_ops tcp_vegas = {
+	.init		= tcp_vegas_init,
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= tcp_vegas_cong_avoid,
+	.min_cwnd	= tcp_reno_min_cwnd,
+	.rtt_sample	= tcp_vegas_rtt_calc,
+	.set_state	= tcp_vegas_state,
+	.cwnd_event	= tcp_vegas_cwnd_event,
+	.get_info	= tcp_vegas_get_info,
+
+	.owner		= THIS_MODULE,
+	.name		= "vegas",
+};
+
+static int __init tcp_vegas_register(void)
+{
+	BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE);
+	tcp_register_congestion_control(&tcp_vegas);
+	return 0;
+}
+
+static void __exit tcp_vegas_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_vegas);
+}
+
+module_init(tcp_vegas_register);
+module_exit(tcp_vegas_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Vegas");
-- cgit
From a7868ea68d29eb2c037952aeb3b549cf05749a18 Mon Sep 17 00:00:00 2001
From: Baruch Even
Date: Thu, 23 Jun 2005 12:28:11 -0700
Subject: [TCP]: Add H-TCP congestion control module.

H-TCP is a congestion control algorithm developed at the Hamilton
Institute, by Douglas Leith and Robert Shorten. It extends the
standard Reno algorithm with mode switching and is thus a relatively
simple modification.

H-TCP is defined in a layered manner as it is still a research
platform. The basic form includes the modification of beta according
to the ratio of maxRTT to min RTT and the alpha=2*factor*(1-beta)
relation, where factor is dependent on the time since last congestion.

The other layers improve convergence by adding appropriate factors to
alpha.

The following patch implements the H-TCP algorithm in its basic form.

Signed-off-by: Baruch Even
Signed-off-by: David S. Miller
---
 net/ipv4/Kconfig    |  13 +++
 net/ipv4/Makefile   |   1 +
 net/ipv4/tcp_htcp.c | 289 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 303 insertions(+)
 create mode 100644 net/ipv4/tcp_htcp.c
(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6c105b60cc00..73a25b52bf7d 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -467,6 +467,18 @@ config TCP_CONG_WESTWOOD
 	TCP Westwood+ significantly increases fairness wrt TCP Reno in
 	wired networks and throughput over wireless links.
 
+config TCP_CONG_HTCP
+	tristate "H-TCP"
+	depends on INET
+	default m
+	---help---
+	H-TCP is a send-side only modification of the TCP Reno
+	protocol stack that optimizes the performance of TCP
+	congestion control for high speed network links. It uses a
+	modeswitch to change the alpha and beta parameters of TCP Reno
+	based on network conditions and in a way so as to be fair with
+	other Reno and H-TCP flows.
+
 config TCP_CONG_HSTCP
 	tristate "High Speed TCP"
 	depends on INET && EXPERIMENTAL
@@ -499,6 +511,7 @@ config TCP_CONG_VEGAS
 	window. TCP Vegas should provide less packet loss, but it is
 	not as aggressive as TCP Reno.
 
+
 endmenu

 source "net/ipv4/ipvs/Kconfig"

diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index a801a97bd011..e96ed17deb96 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
 obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
+obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
 obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \

diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
new file mode 100644
index 000000000000..40168275acf9
--- /dev/null
+++ b/net/ipv4/tcp_htcp.c
@@ -0,0 +1,289 @@
+/*
+ * H-TCP congestion control. The algorithm is detailed in:
+ * R.N.Shorten, D.J.Leith:
+ *   "H-TCP: TCP for high-speed and long-distance networks"
+ *   Proc. PFLDnet, Argonne, 2004.
+ * http://www.hamilton.ie/net/htcp3.pdf
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+#define ALPHA_BASE	(1<<7)	/* 1.0 with shift << 7 */
+#define BETA_MIN	(1<<6)	/* 0.5 with shift << 7 */
+#define BETA_MAX	102	/* 0.8 with shift << 7 */
+
+static int use_rtt_scaling = 1;
+module_param(use_rtt_scaling, int, 0644);
+MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling");
+
+static int use_bandwidth_switch = 1;
+module_param(use_bandwidth_switch, int, 0644);
+MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher");
+
+struct htcp {
+	u16	alpha;		/* Fixed point arith, << 7 */
+	u8	beta;		/* Fixed point arith, << 7 */
+	u8	modeswitch;	/* Delay modeswitch until we had at least one congestion event */
+	u8	ccount;		/* Number of RTTs since last congestion event */
+	u8	undo_ccount;
+	u16	packetcount;
+	u32	minRTT;
+	u32	maxRTT;
+	u32	snd_cwnd_cnt2;
+
+	u32	undo_maxRTT;
+	u32	undo_old_maxB;
+
+	/* Bandwidth estimation */
+	u32	minB;
+	u32	maxB;
+	u32	old_maxB;
+	u32	Bi;
+	u32	lasttime;
+};
+
+static inline void htcp_reset(struct htcp *ca)
+{
+	ca->undo_ccount = ca->ccount;
+	ca->undo_maxRTT = ca->maxRTT;
+	ca->undo_old_maxB = ca->old_maxB;
+
+	ca->ccount = 0;
+	ca->snd_cwnd_cnt2 = 0;
+}
+
+static u32 htcp_cwnd_undo(struct tcp_sock *tp)
+{
+	struct htcp *ca = tcp_ca(tp);
+	ca->ccount = ca->undo_ccount;
+	ca->maxRTT = ca->undo_maxRTT;
+	ca->old_maxB = ca->undo_old_maxB;
+	return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
+}
+
+static inline void measure_rtt(struct tcp_sock *tp)
+{
+	struct htcp *ca = tcp_ca(tp);
+	u32 srtt = tp->srtt>>3;
+
+	/* keep track of minimum RTT seen so far, minRTT is zero at first */
+	if (ca->minRTT > srtt || !ca->minRTT)
+		ca->minRTT = srtt;
+
+	/* max RTT */
+	if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
+		if (ca->maxRTT < ca->minRTT)
+			ca->maxRTT = ca->minRTT;
+		if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50)
+			ca->maxRTT = srtt;
+	}
+}
+
+static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked)
+{
+	struct htcp *ca = tcp_ca(tp);
+	u32 now = tcp_time_stamp;
+
+	/* achieved throughput calculations */
+	if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) {
+		ca->packetcount = 0;
+		ca->lasttime = now;
+		return;
+	}
+
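+	/* Accumulate the packets ACKed since 'lasttime'. Once roughly a
+	 * cwnd's worth has arrived and at least minRTT has passed, the
+	 * code below takes a throughput sample cur_Bi (packets per second,
+	 * since the timestamps are in jiffies) and folds it into the
+	 * smoothed estimate Bi as (3*Bi + cur_Bi)/4.
+	 */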
ca->packetcount += pkts_acked; + + if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7? : 1) + && now - ca->lasttime >= ca->minRTT + && ca->minRTT > 0) { + __u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime); + if (ca->ccount <= 3) { + /* just after backoff */ + ca->minB = ca->maxB = ca->Bi = cur_Bi; + } else { + ca->Bi = (3*ca->Bi + cur_Bi)/4; + if (ca->Bi > ca->maxB) + ca->maxB = ca->Bi; + if (ca->minB > ca->maxB) + ca->minB = ca->maxB; + } + ca->packetcount = 0; + ca->lasttime = now; + } +} + +static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT) +{ + if (use_bandwidth_switch) { + u32 maxB = ca->maxB; + u32 old_maxB = ca->old_maxB; + ca->old_maxB = ca->maxB; + + if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) { + ca->beta = BETA_MIN; + ca->modeswitch = 0; + return; + } + } + + if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) { + ca->beta = (minRTT<<7)/maxRTT; + if (ca->beta < BETA_MIN) + ca->beta = BETA_MIN; + else if (ca->beta > BETA_MAX) + ca->beta = BETA_MAX; + } else { + ca->beta = BETA_MIN; + ca->modeswitch = 1; + } +} + +static inline void htcp_alpha_update(struct htcp *ca) +{ + u32 minRTT = ca->minRTT; + u32 factor = 1; + u32 diff = ca->ccount * minRTT; /* time since last backoff */ + + if (diff > HZ) { + diff -= HZ; + factor = 1+ ( 10*diff + ((diff/2)*(diff/2)/HZ) )/HZ; + } + + if (use_rtt_scaling && minRTT) { + u32 scale = (HZ<<3)/(10*minRTT); + scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */ + factor = (factor<<3)/scale; + if (!factor) + factor = 1; + } + + ca->alpha = 2*factor*((1<<7)-ca->beta); + if (!ca->alpha) + ca->alpha = ALPHA_BASE; +} + +/* After we have the rtt data to calculate beta, we'd still prefer to wait one + * rtt before we adjust our beta to ensure we are working from a consistent + * data. + * + * This function should be called when we hit a congestion event since only at + * that point do we really have a real sense of maxRTT (the queues en route + * were getting just too full now). + */ +static void htcp_param_update(struct tcp_sock *tp) +{ + struct htcp *ca = tcp_ca(tp); + u32 minRTT = ca->minRTT; + u32 maxRTT = ca->maxRTT; + + htcp_beta_update(ca, minRTT, maxRTT); + htcp_alpha_update(ca); + + /* add slowly fading memory for maxRTT to accommodate routing changes etc */ + if (minRTT > 0 && maxRTT > minRTT) + ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100; +} + +static u32 htcp_recalc_ssthresh(struct tcp_sock *tp) +{ + struct htcp *ca = tcp_ca(tp); + htcp_param_update(tp); + return max((tp->snd_cwnd * ca->beta) >> 7, 2U); +} + +static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, + u32 in_flight, int data_acked) +{ + struct htcp *ca = tcp_ca(tp); + + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + measure_rtt(tp); + + /* keep track of number of round-trip times since last backoff event */ + if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) { + ca->ccount++; + ca->snd_cwnd_cnt2 = 0; + htcp_alpha_update(ca); + } + + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd + */ + if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + ca->ccount++; + } + } +} + +/* Lower bound on congestion window. 
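+ * Returning tp->snd_ssthresh means the window-reduction phase will
+ * not pull cwnd below the backoff point just chosen by
+ * htcp_recalc_ssthresh().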
 */
+static u32 htcp_min_cwnd(struct tcp_sock *tp)
+{
+	return tp->snd_ssthresh;
+}
+
+
+static void htcp_init(struct tcp_sock *tp)
+{
+	struct htcp *ca = tcp_ca(tp);
+
+	memset(ca, 0, sizeof(struct htcp));
+	ca->alpha = ALPHA_BASE;
+	ca->beta = BETA_MIN;
+}
+
+static void htcp_state(struct tcp_sock *tp, u8 new_state)
+{
+	switch (new_state) {
+	case TCP_CA_CWR:
+	case TCP_CA_Recovery:
+	case TCP_CA_Loss:
+		htcp_reset(tcp_ca(tp));
+		break;
+	}
+}
+
+static struct tcp_congestion_ops htcp = {
+	.init		= htcp_init,
+	.ssthresh	= htcp_recalc_ssthresh,
+	.min_cwnd	= htcp_min_cwnd,
+	.cong_avoid	= htcp_cong_avoid,
+	.set_state	= htcp_state,
+	.undo_cwnd	= htcp_cwnd_undo,
+	.pkts_acked	= measure_achieved_throughput,
+	.owner		= THIS_MODULE,
+	.name		= "htcp",
+};
+
+static int __init htcp_register(void)
+{
+	BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE);
+	BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
+	if (!use_bandwidth_switch)
+		htcp.pkts_acked = NULL;
+	return tcp_register_congestion_control(&htcp);
+}
+
+static void __exit htcp_unregister(void)
+{
+	tcp_unregister_congestion_control(&htcp);
+}
+
+module_init(htcp_register);
+module_exit(htcp_unregister);
+
+MODULE_AUTHOR("Baruch Even");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("H-TCP");
-- cgit
From 0e57976b6376f7fda6bef8b7dee2a3c8819ec9e9 Mon Sep 17 00:00:00 2001
From: John Heffner
Date: Thu, 23 Jun 2005 12:29:07 -0700
Subject: [TCP]: Add Scalable TCP congestion control module.

This patch implements Tom Kelly's Scalable TCP congestion control
algorithm for the modular framework.

The algorithm has some nice scaling properties, and has been used a
fair bit in research, though is known to have significant fairness
issues, so it's not really suitable for general purpose use.

Signed-off-by: John Heffner
Signed-off-by: David S. Miller
---
 net/ipv4/Kconfig        |  9 +++++++
 net/ipv4/Makefile       |  1 +
 net/ipv4/tcp_scalable.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+)
 create mode 100644 net/ipv4/tcp_scalable.c
(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 73a25b52bf7d..690e88ba2484 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -511,6 +511,15 @@ config TCP_CONG_VEGAS
 	window. TCP Vegas should provide less packet loss, but it is
 	not as aggressive as TCP Reno.
 
+config TCP_CONG_SCALABLE
+	tristate "Scalable TCP"
+	depends on INET && EXPERIMENTAL
+	default n
+	---help---
+	Scalable TCP is a sender-side only change to TCP which uses a
+	MIMD congestion control algorithm which has some nice scaling
+	properties, though is known to have fairness issues.
+	See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
 
 endmenu

diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index e96ed17deb96..5718cdb3a61e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
 obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
 obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 	xfrm4_output.o

diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
new file mode 100644
index 000000000000..70e108e15c71
--- /dev/null
+++ b/net/ipv4/tcp_scalable.c
@@ -0,0 +1,68 @@
+/* Tom Kelly's Scalable TCP
+ *
+ * See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
+ *
+ * John Heffner
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* These factors derived from the recommended values in the paper:
+ * .01 and 7/8.
We use 50 instead of 100 to account for + * delayed ack. + */ +#define TCP_SCALABLE_AI_CNT 50U +#define TCP_SCALABLE_MD_SCALE 3 + +static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, + u32 in_flight, int flag) +{ + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + tp->snd_cwnd++; + } else { + tp->snd_cwnd_cnt++; + if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + } + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static u32 tcp_scalable_ssthresh(struct tcp_sock *tp) +{ + return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); +} + + +static struct tcp_congestion_ops tcp_scalable = { + .ssthresh = tcp_scalable_ssthresh, + .cong_avoid = tcp_scalable_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + + .owner = THIS_MODULE, + .name = "scalable", +}; + +static int __init tcp_scalable_register(void) +{ + return tcp_register_congestion_control(&tcp_scalable); +} + +static void __exit tcp_scalable_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_scalable); +} + +module_init(tcp_scalable_register); +module_exit(tcp_scalable_unregister); + +MODULE_AUTHOR("John Heffner"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Scalable TCP"); -- cgit From c1ebcdb8c422cd73f54bcd2b9953e443a47667e5 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 20:08:59 -0700 Subject: [NET]: Remove obsolete fastroute stats. Remove last vestiages of fastroute code that is no longer used. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++-------- net/core/sysctl_net_core.c | 1 - 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index ab935778ce81..4f1ae2efe872 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2056,14 +2056,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", s->total, s->dropped, s->time_squeeze, s->throttled, - s->fastroute_hit, s->fastroute_success, s->fastroute_defer, - s->fastroute_deferred_out, -#if 0 - s->fastroute_latency_reduction -#else - s->cpu_collision -#endif - ); + 0, 0, 0, 0, /* was fastroute */ + s->cpu_collision ); return 0; } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 880a88815211..76e9987474ca 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -18,7 +18,6 @@ extern int no_cong_thresh; extern int no_cong; extern int lo_cong; extern int mod_cong; -extern int netdev_fastroute; extern int net_msg_cost; extern int net_msg_burst; -- cgit From 34008d8c631d067caffa136313260525f3ae48a2 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 20:10:00 -0700 Subject: [NET]: Remove obsolete netif_rx congestion sensing mechanism. Remove the congestion sensing mechanism from netif_rx, and always return either full or empty. Almost no driver checks the return value from netif_rx, and those that do only use it for debug messages. The original design of netif_rx was to do flow control based on the receive queue, but NAPI has supplanted this and no driver uses the feedback. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/core/dev.c | 88 +--------------------------------------------- net/core/sysctl_net_core.c | 36 ------------------- 2 files changed, 1 insertion(+), 123 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 4f1ae2efe872..3156df699f01 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -115,18 +115,6 @@ #endif /* CONFIG_NET_RADIO */ #include -/* This define, if set, will randomly drop a packet when congestion - * is more than moderate. It helps fairness in the multi-interface - * case when one of them is a hog, but it kills performance for the - * single interface case so it is off now by default. - */ -#undef RAND_LIE - -/* Setting this will sample the queue lengths and thus congestion - * via a timer instead of as each packet is received. - */ -#undef OFFLINE_SAMPLE - /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. @@ -159,11 +147,6 @@ static DEFINE_SPINLOCK(ptype_lock); static struct list_head ptype_base[16]; /* 16 way hashed list */ static struct list_head ptype_all; /* Taps */ -#ifdef OFFLINE_SAMPLE -static void sample_queue(unsigned long dummy); -static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0); -#endif - /* * The @dev_base list is protected by @dev_base_lock and the rtln * semaphore. @@ -1365,69 +1348,10 @@ out: int netdev_max_backlog = 300; int weight_p = 64; /* old backlog weight */ -/* These numbers are selected based on intuition and some - * experimentatiom, if you have more scientific way of doing this - * please go ahead and fix things. - */ -int no_cong_thresh = 10; -int no_cong = 20; -int lo_cong = 100; -int mod_cong = 290; DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; -static void get_sample_stats(int cpu) -{ -#ifdef RAND_LIE - unsigned long rd; - int rq; -#endif - struct softnet_data *sd = &per_cpu(softnet_data, cpu); - int blog = sd->input_pkt_queue.qlen; - int avg_blog = sd->avg_blog; - - avg_blog = (avg_blog >> 1) + (blog >> 1); - - if (avg_blog > mod_cong) { - /* Above moderate congestion levels. 
*/ - sd->cng_level = NET_RX_CN_HIGH; -#ifdef RAND_LIE - rd = net_random(); - rq = rd % netdev_max_backlog; - if (rq < avg_blog) /* unlucky bastard */ - sd->cng_level = NET_RX_DROP; -#endif - } else if (avg_blog > lo_cong) { - sd->cng_level = NET_RX_CN_MOD; -#ifdef RAND_LIE - rd = net_random(); - rq = rd % netdev_max_backlog; - if (rq < avg_blog) /* unlucky bastard */ - sd->cng_level = NET_RX_CN_HIGH; -#endif - } else if (avg_blog > no_cong) - sd->cng_level = NET_RX_CN_LOW; - else /* no congestion */ - sd->cng_level = NET_RX_SUCCESS; - - sd->avg_blog = avg_blog; -} - -#ifdef OFFLINE_SAMPLE -static void sample_queue(unsigned long dummy) -{ -/* 10 ms 0r 1ms -- i don't care -- JHS */ - int next_tick = 1; - int cpu = smp_processor_id(); - - get_sample_stats(cpu); - next_tick += jiffies; - mod_timer(&samp_timer, next_tick); -} -#endif - - /** * netif_rx - post buffer to the network code * @skb: buffer to post @@ -1476,11 +1400,8 @@ int netif_rx(struct sk_buff *skb) enqueue: dev_hold(skb->dev); __skb_queue_tail(&queue->input_pkt_queue, skb); -#ifndef OFFLINE_SAMPLE - get_sample_stats(this_cpu); -#endif local_irq_restore(flags); - return queue->cng_level; + return NET_RX_SUCCESS; } if (queue->throttle) @@ -3300,8 +3221,6 @@ static int __init net_dev_init(void) queue = &per_cpu(softnet_data, i); skb_queue_head_init(&queue->input_pkt_queue); queue->throttle = 0; - queue->cng_level = 0; - queue->avg_blog = 10; /* arbitrary non-zero */ queue->completion_queue = NULL; INIT_LIST_HEAD(&queue->poll_list); set_bit(__LINK_STATE_START, &queue->backlog_dev.state); @@ -3310,11 +3229,6 @@ static int __init net_dev_init(void) atomic_set(&queue->backlog_dev.refcnt, 1); } -#ifdef OFFLINE_SAMPLE - samp_timer.expires = jiffies + (10 * HZ); - add_timer(&samp_timer); -#endif - dev_boot_phase = 0; open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 76e9987474ca..fff63643a35c 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -14,10 +14,6 @@ extern int netdev_max_backlog; extern int weight_p; -extern int no_cong_thresh; -extern int no_cong; -extern int lo_cong; -extern int mod_cong; extern int net_msg_cost; extern int net_msg_burst; @@ -84,38 +80,6 @@ ctl_table core_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, - { - .ctl_name = NET_CORE_NO_CONG_THRESH, - .procname = "no_cong_thresh", - .data = &no_cong_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = NET_CORE_NO_CONG, - .procname = "no_cong", - .data = &no_cong, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = NET_CORE_LO_CONG, - .procname = "lo_cong", - .data = &lo_cong, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = NET_CORE_MOD_CONG, - .procname = "mod_cong", - .data = &mod_cong, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, { .ctl_name = NET_CORE_MSG_COST, .procname = "message_cost", -- cgit From 31aa02c53c84658f6694f319f09e232ede27be5a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 20:12:48 -0700 Subject: [NET]: Eliminate netif_rx massive packet drops. Eliminate the throttling behaviour when the netif receive queue fills because it behaves badly when using high speed networks under load. The throttling cause multiple packet drops that cause TCP to go into slow start mode. 
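Schematically, the enqueue path becomes plain per-packet drop-tail; a condensed paraphrase of the resulting netif_rx() logic, not a literal hunk from the diff below:

	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
		/* room left: tail-queue just this one packet */
		__skb_queue_tail(&queue->input_pkt_queue, skb);
		return NET_RX_SUCCESS;
	}
	/* queue full: drop only this packet, count it, carry on */
	__get_cpu_var(netdev_rx_stat).dropped++;
	kfree_skb(skb);
	return NET_RX_DROP;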
The same effective patch has been part of BIC TCP and H-TCP as well as part of Web100. The existing code drops 100's of packets when the queue fills; this changes it to individual packet drop-tail. Signed-off-by: Stephen Hemmminger Signed-off-by: David S. Miller --- net/core/dev.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 3156df699f01..1a64508e527f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -198,7 +198,7 @@ static struct notifier_block *netdev_chain; * Device drivers call our routines to queue packets here. We empty the * queue in the local softnet handler. */ -DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, }; +DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL }; #ifdef CONFIG_SYSFS extern int netdev_sysfs_init(void); @@ -1372,7 +1372,6 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; int netif_rx(struct sk_buff *skb) { - int this_cpu; struct softnet_data *queue; unsigned long flags; @@ -1388,15 +1387,11 @@ int netif_rx(struct sk_buff *skb) * short when CPU is congested, but is still operating. */ local_irq_save(flags); - this_cpu = smp_processor_id(); queue = &__get_cpu_var(softnet_data); __get_cpu_var(netdev_rx_stat).total++; if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { if (queue->input_pkt_queue.qlen) { - if (queue->throttle) - goto drop; - enqueue: dev_hold(skb->dev); __skb_queue_tail(&queue->input_pkt_queue, skb); @@ -1404,19 +1399,10 @@ enqueue: return NET_RX_SUCCESS; } - if (queue->throttle) - queue->throttle = 0; - netif_rx_schedule(&queue->backlog_dev); goto enqueue; } - if (!queue->throttle) { - queue->throttle = 1; - __get_cpu_var(netdev_rx_stat).throttled++; - } - -drop: __get_cpu_var(netdev_rx_stat).dropped++; local_irq_restore(flags); @@ -1701,8 +1687,6 @@ job_done: smp_mb__before_clear_bit(); netif_poll_enable(backlog_dev); - if (queue->throttle) - queue->throttle = 0; local_irq_enable(); return 0; } @@ -1976,7 +1960,7 @@ static int softnet_seq_show(struct seq_file *seq, void *v) struct netif_rx_stats *s = v; seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", - s->total, s->dropped, s->time_squeeze, s->throttled, + s->total, s->dropped, s->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ s->cpu_collision ); return 0; @@ -3220,7 +3204,6 @@ static int __init net_dev_init(void) queue = &per_cpu(softnet_data, i); skb_queue_head_init(&queue->input_pkt_queue); - queue->throttle = 0; queue->completion_queue = NULL; INIT_LIST_HEAD(&queue->poll_list); set_bit(__LINK_STATE_START, &queue->backlog_dev.state); -- cgit From 51b0bdedb8e784d0d969a6b77151911130812400 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 20:14:40 -0700 Subject: [NET]: Separate two usages of netdev_max_backlog. Separate out the two uses of netdev_max_backlog. One controls the upper bound on packets processed per softirq, the new name for this is netdev_budget; the other controls the limit on packets queued via netif_rx. Increase the max_backlog default to account for faster processors. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/core/dev.c | 6 +++--- net/core/sysctl_net_core.c | 9 +++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 1a64508e527f..7016e0c36b3d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1346,7 +1346,8 @@ out: Receiver routines =======================================================================*/ -int netdev_max_backlog = 300; +int netdev_max_backlog = 1000; +int netdev_budget = 300; int weight_p = 64; /* old backlog weight */ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; @@ -1695,8 +1696,7 @@ static void net_rx_action(struct softirq_action *h) { struct softnet_data *queue = &__get_cpu_var(softnet_data); unsigned long start_time = jiffies; - int budget = netdev_max_backlog; - + int budget = netdev_budget; local_irq_disable(); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index fff63643a35c..8f817ad9f546 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -13,6 +13,7 @@ #ifdef CONFIG_SYSCTL extern int netdev_max_backlog; +extern int netdev_budget; extern int weight_p; extern int net_msg_cost; extern int net_msg_burst; @@ -124,6 +125,14 @@ ctl_table core_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, + { + .ctl_name = NET_CORE_BUDGET, + .procname = "netdev_budget", + .data = &netdev_budget, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { .ctl_name = 0 } }; -- cgit From 5f8ef48d240963093451bcf83df89f1a1364f51d Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 20:37:36 -0700 Subject: [TCP]: Allow choosing TCP congestion control via sockopt. Allow using setsockopt to set TCP congestion control to use on a per socket basis. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv4/tcp.c | 31 ++++++++++++++++++++++++++++++- net/ipv4/tcp_cong.c | 46 ++++++++++++++++++++++++++++++++++++++++++++-- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 4 files changed, 76 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f3dbc8dc1263..882436da9a3a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1927,6 +1927,25 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, return tp->af_specific->setsockopt(sk, level, optname, optval, optlen); + /* This is a string value all the others are int's */ + if (optname == TCP_CONGESTION) { + char name[TCP_CA_NAME_MAX]; + + if (optlen < 1) + return -EINVAL; + + val = strncpy_from_user(name, optval, + min(TCP_CA_NAME_MAX-1, optlen)); + if (val < 0) + return -EFAULT; + name[val] = 0; + + lock_sock(sk); + err = tcp_set_congestion_control(tp, name); + release_sock(sk); + return err; + } + if (optlen < sizeof(int)) return -EINVAL; @@ -2211,6 +2230,16 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, case TCP_QUICKACK: val = !tp->ack.pingpong; break; + + case TCP_CONGESTION: + if (get_user(len, optlen)) + return -EFAULT; + len = min_t(unsigned int, len, TCP_CA_NAME_MAX); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, tp->ca_ops->name, len)) + return -EFAULT; + return 0; default: return -ENOPROTOOPT; }; @@ -2224,7 +2253,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, extern void __skb_cb_too_small_for_tcp(int, int); -extern void tcpdiag_init(void); +extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; static int __init set_thash_entries(char *str) diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 665394a63ae4..4970d10a7785 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -21,7 +21,7 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name) { struct tcp_congestion_ops *e; - list_for_each_entry(e, &tcp_cong_list, list) { + list_for_each_entry_rcu(e, &tcp_cong_list, list) { if (strcmp(e->name, name) == 0) return e; } @@ -77,6 +77,9 @@ void tcp_init_congestion_control(struct tcp_sock *tp) { struct tcp_congestion_ops *ca; + if (tp->ca_ops != &tcp_init_congestion_ops) + return; + rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { if (try_module_get(ca->owner)) { @@ -139,6 +142,34 @@ void tcp_get_default_congestion_control(char *name) rcu_read_unlock(); } +/* Change congestion control for socket */ +int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) +{ + struct tcp_congestion_ops *ca; + int err = 0; + + rcu_read_lock(); + ca = tcp_ca_find(name); + if (ca == tp->ca_ops) + goto out; + + if (!ca) + err = -ENOENT; + + else if (!try_module_get(ca->owner)) + err = -EBUSY; + + else { + tcp_cleanup_congestion_control(tp); + tp->ca_ops = ca; + if (tp->ca_ops->init) + tp->ca_ops->init(tp); + } + out: + rcu_read_unlock(); + return err; +} + /* * TCP Reno congestion control * This is special case used for fallback as well. 
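For reference, the new option is driven from userspace roughly as below; a sketch that assumes a libc exposing the TCP_CONGESTION constant, with error handling elided:

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <string.h>
	#include <sys/socket.h>

	/* Ask the kernel to run the "vegas" module on this socket;
	 * the named module must be built in or loadable. */
	static int use_vegas(int sk)
	{
		return setsockopt(sk, IPPROTO_TCP, TCP_CONGESTION,
				  "vegas", strlen("vegas"));
	}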
@@ -192,4 +223,15 @@ struct tcp_congestion_ops tcp_reno = { .min_cwnd = tcp_reno_min_cwnd, }; -EXPORT_SYMBOL_GPL(tcp_reno); +/* Initial congestion control used (until SYN) + * really reno under another name so we can tell difference + * during tcp_set_default_congestion_control + */ +struct tcp_congestion_ops tcp_init_congestion_ops = { + .name = "", + .owner = THIS_MODULE, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, +}; +EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9122814c13ad..ebf112347a97 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2048,7 +2048,7 @@ static int tcp_v4_init_sock(struct sock *sk) tp->mss_cache_std = tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; - tp->ca_ops = &tcp_reno; + tp->ca_ops = &tcp_init_congestion_ops; sk->sk_state = TCP_CLOSE; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index fce56039b0e9..9dac7fdf4726 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2025,7 +2025,7 @@ static int tcp_v6_init_sock(struct sock *sk) sk->sk_state = TCP_CLOSE; tp->af_specific = &ipv6_specific; - tp->ca_ops = &tcp_reno; + tp->ca_ops = &tcp_init_congestion_ops; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); -- cgit From 677e90eda3bd8cfde0b748daaa46476162a03950 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 23 Jun 2005 20:59:51 -0700 Subject: [NET]: Zerocopy sequential reading of skb data Implements sequential reading for both linear and non-linear skb data at zerocopy cost. The data is returned in chunks of arbitary length, therefore random access is not possible. Usage: from := 0 to := 128 state := undef data := undef len := undef consumed := 0 skb_prepare_seq_read(skb, from, to, &state) while (len = skb_seq_read(consumed, &data, &state)) != 0 do /* do something with 'data' of length 'len' */ if abort then /* abort read if we don't wait for * skb_seq_read() to return 0 */ skb_abort_seq_read(&state) return endif /* not necessary to consume all of 'len' */ consumed += len done Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/skbuff.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6d68c03bc051..d285f2f7e812 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1500,6 +1500,120 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) skb_split_no_header(skb, skb1, len, pos); } +/** + * skb_prepare_seq_read - Prepare a sequential read of skb data + * @skb: the buffer to read + * @from: lower offset of data to be read + * @to: upper offset of data to be read + * @st: state variable + * + * Initializes the specified state variable. Must be called before + * invoking skb_seq_read() for the first time. + */ +void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, + unsigned int to, struct skb_seq_state *st) +{ + st->lower_offset = from; + st->upper_offset = to; + st->root_skb = st->cur_skb = skb; + st->frag_idx = st->stepped_offset = 0; + st->frag_data = NULL; +} + +/** + * skb_seq_read - Sequentially read skb data + * @consumed: number of bytes consumed by the caller so far + * @data: destination pointer for data to be returned + * @st: state variable + * + * Reads a block of skb data at &consumed relative to the + * lower offset specified to skb_prepare_seq_read(). 
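+ *	The walk proceeds over the linear header area first, then the
+ *	paged fragments, then any chained frag_list skbs.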
Assigns + * the head of the data block to &data and returns the length + * of the block or 0 if the end of the skb data or the upper + * offset has been reached. + * + * The caller is not required to consume all of the data + * returned, i.e. &consumed is typically set to the number + * of bytes already consumed and the next call to + * skb_seq_read() will return the remaining part of the block. + * + * Note: The size of each block of data returned can be arbitary, + * this limitation is the cost for zerocopy seqeuental + * reads of potentially non linear data. + * + * Note: Fragment lists within fragments are not implemented + * at the moment, state->root_skb could be replaced with + * a stack for this purpose. + */ +unsigned int skb_seq_read(unsigned int consumed, const u8 **data, + struct skb_seq_state *st) +{ + unsigned int block_limit, abs_offset = consumed + st->lower_offset; + skb_frag_t *frag; + + if (unlikely(abs_offset >= st->upper_offset)) + return 0; + +next_skb: + block_limit = skb_headlen(st->cur_skb); + + if (abs_offset < block_limit) { + *data = st->cur_skb->data + abs_offset; + return block_limit - abs_offset; + } + + if (st->frag_idx == 0 && !st->frag_data) + st->stepped_offset += skb_headlen(st->cur_skb); + + while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { + frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; + block_limit = frag->size + st->stepped_offset; + + if (abs_offset < block_limit) { + if (!st->frag_data) + st->frag_data = kmap_skb_frag(frag); + + *data = (u8 *) st->frag_data + frag->page_offset + + (abs_offset - st->stepped_offset); + + return block_limit - abs_offset; + } + + if (st->frag_data) { + kunmap_skb_frag(st->frag_data); + st->frag_data = NULL; + } + + st->frag_idx++; + st->stepped_offset += frag->size; + } + + if (st->cur_skb->next) { + st->cur_skb = st->cur_skb->next; + st->frag_idx = 0; + goto next_skb; + } else if (st->root_skb == st->cur_skb && + skb_shinfo(st->root_skb)->frag_list) { + st->cur_skb = skb_shinfo(st->root_skb)->frag_list; + goto next_skb; + } + + return 0; +} + +/** + * skb_abort_seq_read - Abort a sequential read of skb data + * @st: state variable + * + * Must be called if skb_seq_read() was not called until it + * returned 0. + */ +void skb_abort_seq_read(struct skb_seq_state *st) +{ + if (st->frag_data) + kunmap_skb_frag(st->frag_data); +} + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create("skbuff_head_cache", @@ -1538,3 +1652,6 @@ EXPORT_SYMBOL(skb_queue_tail); EXPORT_SYMBOL(skb_unlink); EXPORT_SYMBOL(skb_append); EXPORT_SYMBOL(skb_split); +EXPORT_SYMBOL(skb_prepare_seq_read); +EXPORT_SYMBOL(skb_seq_read); +EXPORT_SYMBOL(skb_abort_seq_read); -- cgit From 3fc7e8a6d842f72d16d2623b1022814a635ab961 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 23 Jun 2005 21:00:17 -0700 Subject: [NET]: skb_find_text() - Find a text pattern in skb data Finds a pattern in the skb data according to the specified textsearch configuration. Use textsearch_next() to retrieve subsequent occurrences of the pattern. Returns the offset to the first occurrence or UINT_MAX if no match was found. Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d285f2f7e812..bb73b2190ec7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1614,6 +1614,45 @@ void skb_abort_seq_read(struct skb_seq_state *st) kunmap_skb_frag(st->frag_data); } +#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) + +static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, + struct ts_config *conf, + struct ts_state *state) +{ + return skb_seq_read(offset, text, TS_SKB_CB(state)); +} + +static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) +{ + skb_abort_seq_read(TS_SKB_CB(state)); +} + +/** + * skb_find_text - Find a text pattern in skb data + * @skb: the buffer to look in + * @from: search offset + * @to: search limit + * @config: textsearch configuration + * @state: uninitialized textsearch state variable + * + * Finds a pattern in the skb data according to the specified + * textsearch configuration. Use textsearch_next() to retrieve + * subsequent occurrences of the pattern. Returns the offset + * to the first occurrence or UINT_MAX if no match was found. + */ +unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, + unsigned int to, struct ts_config *config, + struct ts_state *state) +{ + config->get_next_block = skb_ts_get_next_block; + config->finish = skb_ts_finish; + + skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state)); + + return textsearch_find(config, state); +} + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create("skbuff_head_cache", @@ -1655,3 +1694,4 @@ EXPORT_SYMBOL(skb_split); EXPORT_SYMBOL(skb_prepare_seq_read); EXPORT_SYMBOL(skb_seq_read); EXPORT_SYMBOL(skb_abort_seq_read); +EXPORT_SYMBOL(skb_find_text); -- cgit From d675c989ed2d4ba23dff615330b04371aea83534 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 23 Jun 2005 21:00:58 -0700 Subject: [PKT_SCHED]: Packet classification based on textsearch (ematch) Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/sched/Kconfig | 11 ++++ net/sched/Makefile | 1 + net/sched/em_text.c | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 169 insertions(+) create mode 100644 net/sched/em_text.c (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index b22c9beb604d..95d9bc5d8621 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -449,6 +449,17 @@ config NET_EMATCH_META To compile this code as a module, choose M here: the module will be called em_meta. +config NET_EMATCH_TEXT + tristate "Textsearch" + depends on NET_EMATCH + ---help--- + Say Y here if you want to be ablt to classify packets based on + textsearch comparisons. Please select the appropriate textsearch + algorithms in the Library section. + + To compile this code as a module, choose M here: the + module will be called em_text. 
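As a rough kernel-side illustration of the API this classifier builds on, using only calls shown in the skbuff patches above (the pattern and policy here are made up, and error handling is abbreviated):

	/* Return non-zero if the skb payload contains "GET ". */
	static int skb_has_get(struct sk_buff *skb)
	{
		struct ts_config *conf;
		struct ts_state state;
		unsigned int pos;

		conf = textsearch_prepare("kmp", "GET ", 4, GFP_KERNEL, 0);
		if (IS_ERR(conf))
			return 0;

		pos = skb_find_text(skb, 0, skb->len, conf, &state);
		textsearch_destroy(conf);
		return pos != UINT_MAX;
	}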
+
 config NET_CLS_ACT
 	bool "Packet ACTION"
 	depends on EXPERIMENTAL && NET_CLS && NET_QOS

diff --git a/net/sched/Makefile b/net/sched/Makefile
index eb3fe583eba8..8f58cecd6266 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -40,3 +40,4 @@ obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
 obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
 obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
 obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
+obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o

diff --git a/net/sched/em_text.c b/net/sched/em_text.c
new file mode 100644
index 000000000000..873840d8d072
--- /dev/null
+++ b/net/sched/em_text.c
@@ -0,0 +1,157 @@
+/*
+ * net/sched/em_text.c	Textsearch ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/textsearch.h>
+#include <linux/tc_ematch/tc_em_text.h>
+#include <linux/rtnetlink.h>
+#include <net/pkt_cls.h>
+
+struct text_match
+{
+	u16			from_offset;
+	u16			to_offset;
+	u8			from_layer;
+	u8			to_layer;
+	struct ts_config	*config;
+};
+
+#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data)
+
+static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
+			 struct tcf_pkt_info *info)
+{
+	struct text_match *tm = EM_TEXT_PRIV(m);
+	int from, to;
+	struct ts_state state;
+
+	from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
+	from += tm->from_offset;
+
+	to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
+	to += tm->to_offset;
+
+	return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX;
+}
+
+static int em_text_change(struct tcf_proto *tp, void *data, int len,
+			  struct tcf_ematch *m)
+{
+	struct text_match *tm;
+	struct tcf_em_text *conf = data;
+	struct ts_config *ts_conf;
+	int flags = 0;
+
+	printk("Configuring text: %s from %d:%d to %d:%d len %d\n", conf->algo, conf->from_offset,
+	       conf->from_layer, conf->to_offset, conf->to_layer, conf->pattern_len);
+
+	if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len))
+		return -EINVAL;
+
+	if (conf->from_layer > conf->to_layer)
+		return -EINVAL;
+
+	if (conf->from_layer == conf->to_layer &&
+	    conf->from_offset > conf->to_offset)
+		return -EINVAL;
+
+retry:
+	ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf),
+				     conf->pattern_len, GFP_KERNEL, flags);
+
+	if (flags & TS_AUTOLOAD)
+		rtnl_lock();
+
+	if (IS_ERR(ts_conf)) {
+		if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) {
+			rtnl_unlock();
+			flags |= TS_AUTOLOAD;
+			goto retry;
+		} else
+			return PTR_ERR(ts_conf);
+	} else if (flags & TS_AUTOLOAD) {
+		textsearch_destroy(ts_conf);
+		return -EAGAIN;
+	}
+
+	tm = kmalloc(sizeof(*tm), GFP_KERNEL);
+	if (tm == NULL) {
+		textsearch_destroy(ts_conf);
+		return -ENOBUFS;
+	}
+
+	tm->from_offset = conf->from_offset;
+	tm->to_offset = conf->to_offset;
+	tm->from_layer = conf->from_layer;
+	tm->to_layer = conf->to_layer;
+	tm->config = ts_conf;
+
+	m->datalen = sizeof(*tm);
+	m->data = (unsigned long) tm;
+
+	return 0;
+}
+
+static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
+{
+	textsearch_destroy(EM_TEXT_PRIV(m)->config);
+}
+
+static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
+{
+	struct text_match *tm = EM_TEXT_PRIV(m);
+	struct tcf_em_text conf;
+
+	strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1);
+	conf.from_offset = tm->from_offset;
+	conf.to_offset = tm->to_offset;
+	conf.from_layer =
tm->from_layer; + conf.to_layer = tm->to_layer; + conf.pattern_len = textsearch_get_pattern_len(tm->config); + conf.pad = 0; + + RTA_PUT_NOHDR(skb, sizeof(conf), &conf); + RTA_APPEND(skb, conf.pattern_len, textsearch_get_pattern(tm->config)); + return 0; + +rtattr_failure: + return -1; +} + +static struct tcf_ematch_ops em_text_ops = { + .kind = TCF_EM_TEXT, + .change = em_text_change, + .match = em_text_match, + .destroy = em_text_destroy, + .dump = em_text_dump, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_text_ops.link) +}; + +static int __init init_em_text(void) +{ + return tcf_em_register(&em_text_ops); +} + +static void __exit exit_em_text(void) +{ + tcf_em_unregister(&em_text_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_text); +module_exit(exit_em_text); -- cgit From f2d368fa3ef90f2159d9e542303901ebf68144dd Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 23 Jun 2005 23:55:41 -0700 Subject: [PKT_SCHED]: Make NET_EMATCH_TEXT select TEXTSEARCh Signed-off-by: David S. Miller --- net/sched/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 95d9bc5d8621..447b89e556b1 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -452,6 +452,7 @@ config NET_EMATCH_META config NET_EMATCH_TEXT tristate "Textsearch" depends on NET_EMATCH + select TEXTSEARCH ---help--- Say Y here if you want to be ablt to classify packets based on textsearch comparisons. Please select the appropriate textsearch -- cgit From 5ba266d6323e957588712f6a7d31252cd6b797bb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 23 Jun 2005 22:03:15 -0700 Subject: [PATCH] knfsd: nfsd4: fix probe_callback rpc_create_client was modified recently to do its own (synchronous) NULL ping of the server. We'd rather do that on our own, asynchronously, so that we don't have to block the nfsd thread doing the probe, and so that setclientid handling (hence, client mounts) can proceed normally whether the callback is succesful or not. (We can still function fine without the callback channel--we just won't be able to give out delegations till it's verified to work.) Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/sunrpc/sunrpc_syms.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 32e8acbc60fe..62a073495276 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -41,6 +41,7 @@ EXPORT_SYMBOL(rpc_release_task); /* RPC client functions */ EXPORT_SYMBOL(rpc_create_client); +EXPORT_SYMBOL(rpc_new_client); EXPORT_SYMBOL(rpc_clone_client); EXPORT_SYMBOL(rpc_bind_new_program); EXPORT_SYMBOL(rpc_destroy_client); -- cgit From 52c1da39534fb382c061de58b65f678ad74b59f5 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 23 Jun 2005 22:05:33 -0700 Subject: [PATCH] make various thing static Another rollup of patches which give various symbols static scope Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/sctp/sm_statefuns.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 058189684c7c..86073df418f5 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -92,6 +92,17 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep, sctp_cmd_seq_t *commands); static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk); +static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, + __u16 error, + const struct sctp_association *asoc, + struct sctp_transport *transport); + +static sctp_disposition_t sctp_sf_violation_chunklen( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands); /* Small helper function that checks if the chunk length * is of the appropriate length. The 'required_length' argument @@ -2328,7 +2339,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_abort(const struct sctp_endpoint *ep, * * This is common code called by several sctp_sf_*_abort() functions above. */ -sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, +static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, __u16 error, const struct sctp_association *asoc, struct sctp_transport *transport) @@ -3687,7 +3698,8 @@ sctp_disposition_t sctp_sf_violation(const struct sctp_endpoint *ep, * * Generate an ABORT chunk and terminate the association. */ -sctp_disposition_t sctp_sf_violation_chunklen(const struct sctp_endpoint *ep, +static sctp_disposition_t sctp_sf_violation_chunklen( + const struct sctp_endpoint *ep, const struct sctp_association *asoc, const sctp_subtype_t type, void *arg, -- cgit From f7704347a74fceaf79c89f8b8dbdd0111013e4d6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 24 Jun 2005 17:39:03 -0700 Subject: [PKT_SCHED]: Make TEXTSEARCH* options only selected. Do not present these confusing new options to the user unless he picked some facility that makes use of it, such as NET_EMATCH_TEXT. Signed-off-by: David S. 
Miller --- net/sched/Kconfig | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 447b89e556b1..7bac249258e3 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -453,10 +453,11 @@ config NET_EMATCH_TEXT tristate "Textsearch" depends on NET_EMATCH select TEXTSEARCH + select TEXTSEARCH_KMP + select TEXTSEARCH_FSM ---help--- Say Y here if you want to be ablt to classify packets based on - textsearch comparisons. Please select the appropriate textsearch - algorithms in the Library section. + textsearch comparisons. To compile this code as a module, choose M here: the module will be called em_text. -- cgit From bb298ca3ce92574d57c4e49b329421425ea7d279 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 24 Jun 2005 17:50:53 -0700 Subject: [IPV4]: Move FIB lookup algorithm choice under IP_ADVANCED_ROUTING Most users need not be concerned with a complex choice of what FIB lookup algorithm to use. So give them the safe default of IP_FIB_HASH if IP_ADVANCED_ROUTING is disabled. Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 64 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 690e88ba2484..7bcfb84126d1 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -1,32 +1,6 @@ # # IP configuration # -choice - prompt "Choose IP: FIB lookup" - depends on INET - default IP_FIB_HASH - -config IP_FIB_HASH - bool "FIB_HASH" - ---help--- - Current FIB is very proven and good enough for most users. - -config IP_FIB_TRIE - bool "FIB_TRIE" - ---help--- - Use new experimental LC-trie as FIB lookup algoritm. - This improves lookup performance - - LC-trie is described in: - - IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson - IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 - An experimental study of compression methods for dynamic tries - Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. - http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ - -endchoice - config IP_MULTICAST bool "IP: multicasting" depends on INET @@ -79,6 +53,44 @@ config IP_ADVANCED_ROUTER If unsure, say N here. +choice + prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)" + depends on IP_ADVANCED_ROUTER + default IP_FIB_HASH + +config IP_FIB_HASH + bool "FIB_HASH" + ---help--- + Current FIB is very proven and good enough for most users. + +config IP_FIB_TRIE + bool "FIB_TRIE" + ---help--- + Use new experimental LC-trie as FIB lookup algoritm. + This improves lookup performance if you have a large + number of routes. + + LC-trie is a longest matching prefix lookup algorithm which + performs better than FIB_HASH for large routing tables. + But, it consumes more memory and is more complex. + + LC-trie is described in: + + IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson + IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 + An experimental study of compression methods for dynamic tries + Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. + http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ + +endchoice + +# If the user does not enable advanced routing, he gets the safe +# default of the fib-hash algorithm. 
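+# (A promptless bool such as the entry below cannot be toggled by the
+# user; whenever its dependency is met it simply takes its default,
+# which is how the forced IP_FIB_HASH=y is expressed in Kconfig.)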
+config IP_FIB_HASH + bool + depends on !IP_ADVANCED_ROUTER + default y + config IP_MULTIPLE_TABLES bool "IP: policy routing" depends on IP_ADVANCED_ROUTER -- cgit From a6484045fdd4154f8c8ee8c1dda4e32854c047e0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 24 Jun 2005 18:07:51 -0700 Subject: [TCP]: Do not present confusing congestion control options by default. Create TCP_CONG_ADVANCED option, akin to IP_ADVANCED_ROUTER, which when disabled will bypass all of the congestion control Kconfig options and leave the user with a safe default. That safe default is currently BIC-TCP with new Reno as a fallback. Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 7bcfb84126d1..347083433120 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -445,9 +445,22 @@ config IP_TCPDIAG config IP_TCPDIAG_IPV6 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) +config TCP_CONG_ADVANCED + bool "TCP: advanced congestion control" + depends on INET + default y + ---help--- + Support for selection of various TCP congestion control + modules. + + Nearly all users can safely say no here, and a safe default + selection will be made (BIC-TCP with new Reno as a fallback). + + If unsure, say N. + # TCP Reno is builtin (required as fallback) menu "TCP congestion control" - depends on INET + depends on TCP_CONG_ADVANCED config TCP_CONG_BIC tristate "Binary Increase Congestion (BIC) control" @@ -535,5 +548,10 @@ config TCP_CONG_SCALABLE endmenu +config TCP_CONG_BIC + boolean + depends on !TCP_CONG_ADVANCED + default y + source "net/ipv4/ipvs/Kconfig" -- cgit From c54d7e03c3a21b38c587f671704c5a12aa3987fc Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 24 Jun 2005 19:57:07 -0700 Subject: [SUNRPC]: Fix {s,}size_t printf format strings in xprt.c Signed-off-by: David S. Miller --- net/sunrpc/xprt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index eca92405948f..269f217918a3 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -970,7 +970,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc) goto out; } - dprintk("RPC: XID %08x read %u bytes\n", + dprintk("RPC: XID %08x read %Zd bytes\n", ntohl(xprt->tcp_xid), r); dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); @@ -1006,7 +1006,7 @@ tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc) desc->count -= len; desc->offset += len; xprt->tcp_offset += len; - dprintk("RPC: discarded %u bytes\n", len); + dprintk("RPC: discarded %Zu bytes\n", len); tcp_check_recm(xprt); } -- cgit From 3e1d1d28d99dabe63c64f7f40f1ca1d646de1f73 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 24 Jun 2005 23:13:50 -0700 Subject: [PATCH] Cleanup patch for process freezing 1. Establish a simple API for process freezing defined in linux/include/sched.h: frozen(process) Check for frozen process freezing(process) Check if a process is being frozen freeze(process) Tell a process to freeze (go to refrigerator) thaw_process(process) Restart process frozen_process(process) Process is frozen now 2. Remove all references to PF_FREEZE and PF_FROZEN from all kernel sources except sched.h 3. Fix numerous locations where try_to_freeze is manually done by a driver 4. 
Remove the argument that is no longer necessary from two function calls. 5. Some whitespace cleanup 6. Clear potential race in refrigerator (provides an open window of PF_FREEZE cleared before setting PF_FROZEN, recalc_sigpending does not check PF_FROZEN). This patch does not address the problem of freeze_processes() violating the rule that a task may only modify its own flags by setting PF_FREEZE. This is not clean in an SMP environment. freeze(process) is therefore not SMP safe! Signed-off-by: Christoph Lameter Signed-off-by: Linus Torvalds --- net/rxrpc/krxiod.c | 2 +- net/rxrpc/krxsecd.c | 2 +- net/rxrpc/krxtimod.c | 2 +- net/sunrpc/svcsock.c | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/rxrpc/krxiod.c b/net/rxrpc/krxiod.c index 2b537f425a17..dada34a77b21 100644 --- a/net/rxrpc/krxiod.c +++ b/net/rxrpc/krxiod.c @@ -138,7 +138,7 @@ static int rxrpc_krxiod(void *arg) _debug("### End Work"); - try_to_freeze(PF_FREEZE); + try_to_freeze(); /* discard pending signals */ rxrpc_discard_my_signals(); diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c index 6020c89d9228..1aadd026d354 100644 --- a/net/rxrpc/krxsecd.c +++ b/net/rxrpc/krxsecd.c @@ -107,7 +107,7 @@ static int rxrpc_krxsecd(void *arg) _debug("### End Inbound Calls"); - try_to_freeze(PF_FREEZE); + try_to_freeze(); /* discard pending signals */ rxrpc_discard_my_signals(); diff --git a/net/rxrpc/krxtimod.c b/net/rxrpc/krxtimod.c index 249c2b0290bb..3ac81cdd1211 100644 --- a/net/rxrpc/krxtimod.c +++ b/net/rxrpc/krxtimod.c @@ -90,7 +90,7 @@ static int krxtimod(void *arg) complete_and_exit(&krxtimod_dead, 0); } - try_to_freeze(PF_FREEZE); + try_to_freeze(); /* discard pending signals */ rxrpc_discard_my_signals(); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 05907035bc96..56db8f13e6cb 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1185,8 +1185,8 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) arg->page_len = (pages-2)*PAGE_SIZE; arg->len = (pages-1)*PAGE_SIZE; arg->tail[0].iov_len = 0; - - try_to_freeze(PF_FREEZE); + + try_to_freeze(); if (signalled()) return -EINTR; @@ -1227,7 +1227,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) schedule_timeout(timeout); - try_to_freeze(PF_FREEZE); + try_to_freeze(); spin_lock_bh(&serv->sv_lock); remove_wait_queue(&rqstp->rq_wait, &wait); -- cgit From 6c3607676c12d77d70cc712310f52fbc6af5895d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 26 Jun 2005 15:20:20 -0700 Subject: [IPV4]: Fix thinko in TCP_CONG_BIC default. Since it is tristate when we offer it as a choice, we should define it also as tristate when forcing it as the default. Otherwise kconfig warns. Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 347083433120..7342a9afe018 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -549,7 +549,7 @@ config TCP_CONG_SCALABLE endmenu config TCP_CONG_BIC - boolean + tristate depends on !TCP_CONG_ADVANCED default y -- cgit From 60fe7403209179fccd6629172c4b36acc69c5db6 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 26 Jun 2005 15:21:15 -0700 Subject: [TCP]: Let TCP_CONG_ADVANCED default to n It doesn't seem to make much sense to let an "If unsure, say N." option default to y. Signed-off-by: Adrian Bunk Signed-off-by: David S.
Miller --- net/ipv4/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 7342a9afe018..3e63123f7bbd 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -448,7 +448,6 @@ config IP_TCPDIAG_IPV6 config TCP_CONG_ADVANCED bool "TCP: advanced congestion control" depends on INET - default y ---help--- Support for selection of various TCP congestion control modules. -- cgit From 64053beeb5a5f9cd79903a8c3dd35d1ef0a4685f Mon Sep 17 00:00:00 2001 From: Robert Olsson Date: Sun, 26 Jun 2005 15:27:10 -0700 Subject: [PKTGEN]: Fix random packet sizes causing panic Signed-off-by: Robert Olsson Signed-off-by: David S. Miller --- net/core/pktgen.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index c57b06bc79f3..975d651312dc 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -151,7 +151,7 @@ #include -#define VERSION "pktgen v2.61: Packet Generator for packet performance testing.\n" +#define VERSION "pktgen v2.62: Packet Generator for packet performance testing.\n" /* #define PG_DEBUG(a) a */ #define PG_DEBUG(a) @@ -1921,6 +1921,11 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, struct iphdr *iph; struct pktgen_hdr *pgh = NULL; + /* Update any of the values, used when we're incrementing various + * fields. + */ + mod_cur_headers(pkt_dev); + skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC); if (!skb) { sprintf(pkt_dev->result, "No memory"); @@ -1934,11 +1939,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)); udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); - /* Update any of the values, used when we're incrementing various - * fields. - */ - mod_cur_headers(pkt_dev); - memcpy(eth, pkt_dev->hh, 12); *(u16*)&eth[12] = __constant_htons(ETH_P_IP); @@ -2192,7 +2192,12 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, int datalen; struct ipv6hdr *iph; struct pktgen_hdr *pgh = NULL; - + + /* Update any of the values, used when we're incrementing various + * fields. + */ + mod_cur_headers(pkt_dev); + skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC); if (!skb) { sprintf(pkt_dev->result, "No memory"); @@ -2206,17 +2211,9 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr)); udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); - - /* Update any of the values, used when we're incrementing various - * fields. - */ - mod_cur_headers(pkt_dev); - - memcpy(eth, pkt_dev->hh, 12); *(u16*)&eth[12] = __constant_htons(ETH_P_IPV6); - - + datalen = pkt_dev->cur_pkt_size-14- sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */ -- cgit From d470e3b483dcf79c16463bc740738dca76a035a9 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 26 Jun 2005 15:31:51 -0700 Subject: [NETLINK]: Fix two socket hashing bugs. 1) netlink_release() should only decrement the hash entry count if the socket was actually hashed. This was causing hash->entries to underflow, which resulted in all kinds of troubles. On 64-bit systems, this would cause the following conditional to erroneously trigger: err = -ENOMEM; if (BITS_PER_LONG > 32 && unlikely(hash->entries >= UINT_MAX)) goto err; 2) netlink_autobind() needs to propagate the error return from netlink_insert().
Otherwise, callers will not see the error as they should and thus try to operate on a socket with a zero pid, which is very bad. However, it should not propagate -EBUSY. If two threads race to autobind the socket, that is fine. This is consistent with the autobind behavior in other protocols. So bug #1 above, combined with this one, resulted in hangs on netlink_sendmsg() calls to the rtnetlink socket. We'd try to do the user sendmsg() with the socket's pid set to zero; later we do a socket lookup using that pid (via the value we stashed away in NETLINK_CB(skb).pid), but that won't give us the user socket, it will give us the rtnetlink socket. So when we try to wake up the receive queue, we dive back into rtnetlink_rcv() which tries to recursively take the rtnetlink semaphore. Thanks to Jakub Jelinek for providing backtraces. Also, thanks to Herbert Xu for supplying debugging patches to help track this down, and also finding a mistake in an earlier version of this fix. Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 70bcd4744d93..fc456a7aaec3 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -315,8 +315,8 @@ err: static void netlink_remove(struct sock *sk) { netlink_table_grab(); - nl_table[sk->sk_protocol].hash.entries--; - sk_del_node_init(sk); + if (sk_del_node_init(sk)) + nl_table[sk->sk_protocol].hash.entries--; if (nlk_sk(sk)->groups) __sk_del_bind_node(sk); netlink_table_ungrab(); @@ -429,7 +429,12 @@ retry: err = netlink_insert(sk, pid); if (err == -EADDRINUSE) goto retry; - return 0; + + /* If 2 threads race to autobind, that is fine. */ + if (err == -EBUSY) + err = 0; + + return err; } static inline int netlink_capable(struct socket *sock, unsigned int flag) -- cgit From 4da62fc70d7cbcf8fa606a8c806d9dc8faa0ceae Mon Sep 17 00:00:00 2001 From: pageexec Date: Sun, 26 Jun 2005 16:00:19 -0700 Subject: [IPVS]: Fix for overflows From: $subject was fixed in 2.4 already, 2.6 needs it as well. The impact of the bugs is a kernel stack overflow and privilege escalation from CAP_NET_ADMIN via the IP_VS_SO_SET_STARTDAEMON/IP_VS_SO_GET_DAEMON ioctls. People running with 'root=all caps' (i.e., most users) are not really affected (there's nothing to escalate), but SELinux and similar users should take it seriously if they grant CAP_NET_ADMIN to other users. Signed-off-by: Andrew Morton Signed-off-by: David S.
Miller --- net/ipv4/ipvs/ip_vs_ctl.c | 8 +++++--- net/ipv4/ipvs/ip_vs_sync.c | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 218d9701036e..12a82e91d22a 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -2059,7 +2059,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) dst->addr = src->addr; dst->port = src->port; dst->fwmark = src->fwmark; - strcpy(dst->sched_name, src->scheduler->name); + strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); dst->flags = src->flags; dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; @@ -2080,6 +2080,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get, list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { if (count >= get->num_services) goto out; + memset(&entry, 0, sizeof(entry)); ip_vs_copy_service(&entry, svc); if (copy_to_user(&uptr->entrytable[count], &entry, sizeof(entry))) { @@ -2094,6 +2095,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get, list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { if (count >= get->num_services) goto out; + memset(&entry, 0, sizeof(entry)); ip_vs_copy_service(&entry, svc); if (copy_to_user(&uptr->entrytable[count], &entry, sizeof(entry))) { @@ -2304,12 +2306,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) memset(&d, 0, sizeof(d)); if (ip_vs_sync_state & IP_VS_STATE_MASTER) { d[0].state = IP_VS_STATE_MASTER; - strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn); + strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn)); d[0].syncid = ip_vs_master_syncid; } if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { d[1].state = IP_VS_STATE_BACKUP; - strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn); + strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn)); d[1].syncid = ip_vs_backup_syncid; } if (copy_to_user(user, &d, sizeof(d)) != 0) diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 25c479550a32..574d1f509b46 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -839,10 +839,10 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) ip_vs_sync_state |= state; if (state == IP_VS_STATE_MASTER) { - strcpy(ip_vs_master_mcast_ifn, mcast_ifn); + strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, sizeof(ip_vs_master_mcast_ifn)); ip_vs_master_syncid = syncid; } else { - strcpy(ip_vs_backup_mcast_ifn, mcast_ifn); + strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, sizeof(ip_vs_backup_mcast_ifn)); ip_vs_backup_syncid = syncid; } -- cgit From 85c1937b2693a0d4e39bb2644d720ed3703b9830 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 28 Jun 2005 12:39:40 -0700 Subject: [EBTABLES]: Fix thinkos in ebt_log.c When converting over the skb_header_pointer(), I converted parts of this module incorrectly. Kill the 'u' union in ebt_log() and all the bogus references to it. Signed-off-by: David S. 
Miller --- net/bridge/netfilter/ebt_log.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index e4ae34b88925..662975be3d1d 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -61,8 +61,6 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, { struct ebt_log_info *info = (struct ebt_log_info *)data; char level_string[4] = "< >"; - union {struct iphdr iph; struct tcpudphdr ports; - struct arphdr arph; struct arppayload arpp;} u; level_string[1] = '0' + info->loglevel; spin_lock_bh(&ebt_log_lock); @@ -88,7 +86,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, } printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,", NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); - printk(" IP tos=0x%02X, IP proto=%d", u.iph.tos, + printk(" IP tos=0x%02X, IP proto=%d", ih->tos, ih->protocol); if (ih->protocol == IPPROTO_TCP || ih->protocol == IPPROTO_UDP) { @@ -127,7 +125,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, ah->ar_pln == sizeof(uint32_t)) { struct arppayload _arpp, *ap; - ap = skb_header_pointer(skb, sizeof(u.arph), + ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp); if (ap == NULL) { printk(" INCOMPLETE ARP payload"); -- cgit From 4095ebf1e641b0f37ee1cd04c903bb85cf4ed25b Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 28 Jun 2005 12:49:30 -0700 Subject: [NETFILTER]: ipt_CLUSTERIP: fix ARP mangling This patch adds mangling of ARP requests (in addition to replies), since ARP caches are made from snooping both requests and replies. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 9cde8c61f525..6706d3a1bc4f 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -30,7 +30,7 @@ #include #include -#define CLUSTERIP_VERSION "0.6" +#define CLUSTERIP_VERSION "0.7" #define DEBUG_CLUSTERIP @@ -524,8 +524,9 @@ arp_mangle(unsigned int hook, || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) return NF_ACCEPT; - /* we only want to mangle arp replies */ - if (arp->ar_op != htons(ARPOP_REPLY)) + /* we only want to mangle arp requests and replies */ + if (arp->ar_op != htons(ARPOP_REPLY) + && arp->ar_op != htons(ARPOP_REQUEST)) return NF_ACCEPT; payload = (void *)(arp+1); -- cgit From b3563c4fbff906991a1b4ef4609f99cca2a0de6a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 28 Jun 2005 12:54:43 -0700 Subject: [NETLINK]: Clear padding in netlink messages Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e013d836a7ab..879237c378f8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -126,6 +126,7 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data rta->rta_type = attrtype; rta->rta_len = size; memcpy(RTA_DATA(rta), data, attrlen); + memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); } size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size) -- cgit From 9ef1d4c7c7aca1cd436612b6ca785b726ffb8ed8 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 28 Jun 2005 12:55:30 -0700 Subject: [NETLINK]: Missing initializations in dumped data Mostly missing initialization of padding fields of 1 or 2 bytes length, two instances of uninitialized nlmsgerr->msg of 16 bytes length. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/neighbour.c | 6 ++++++ net/core/rtnetlink.c | 1 + net/core/wireless.c | 1 + net/ipv4/ipmr.c | 10 ++++++++-- net/ipv6/addrconf.c | 1 + net/sched/act_api.c | 10 +++++++++- net/sched/cls_api.c | 2 ++ net/sched/sch_api.c | 2 ++ net/sched/sch_cbq.c | 2 ++ 9 files changed, 32 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 851eb927ed97..1beb782ac41b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1598,6 +1598,8 @@ static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb, read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; + ndtmsg->ndtm_pad1 = 0; + ndtmsg->ndtm_pad2 = 0; RTA_PUT_STRING(skb, NDTA_NAME, tbl->id); RTA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval); @@ -1683,6 +1685,8 @@ static int neightbl_fill_param_info(struct neigh_table *tbl, read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; + ndtmsg->ndtm_pad1 = 0; + ndtmsg->ndtm_pad2 = 0; RTA_PUT_STRING(skb, NDTA_NAME, tbl->id); if (neightbl_fill_parms(skb, parms) < 0) @@ -1872,6 +1876,8 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n, struct ndmsg *ndm = NLMSG_DATA(nlh); ndm->ndm_family = n->ops->family; + ndm->ndm_pad1 = 0; + ndm->ndm_pad2 = 0; ndm->ndm_flags = n->flags; ndm->ndm_type = n->type; ndm->ndm_ifindex = n->dev->ifindex; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 879237c378f8..4b1bb30e6381 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -189,6 +189,7 @@ static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*r), flags); r = NLMSG_DATA(nlh); r->ifi_family = AF_UNSPEC; + r->__ifi_pad = 0; r->ifi_type = dev->type; r->ifi_index = dev->ifindex; r->ifi_flags = dev_get_flags(dev); diff --git a/net/core/wireless.c b/net/core/wireless.c index b2fe378dfbf8..3ff5639c0b78 100644 --- a/net/core/wireless.c +++ b/net/core/wireless.c @@ -1102,6 +1102,7 @@ static inline int rtnetlink_fill_iwinfo(struct sk_buff * skb, nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r)); r = NLMSG_DATA(nlh); r->ifi_family = AF_UNSPEC; + r->__ifi_pad = 0; r->ifi_type = dev->type; r->ifi_index = dev->ifindex; r->ifi_flags = dev->flags; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index e4f809a93f47..7833d920bdba 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -297,6 +297,7 @@ static int vif_delete(int vifi) static void ipmr_destroy_unres(struct mfc_cache *c) { struct sk_buff *skb; + struct nlmsgerr *e; atomic_dec(&cache_resolve_queue_len); @@ -306,7 +307,9 @@ static 
void ipmr_destroy_unres(struct mfc_cache *c) nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); - ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT; + e = NLMSG_DATA(nlh); + e->error = -ETIMEDOUT; + memset(&e->msg, 0, sizeof(e->msg)); netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); } else kfree_skb(skb); @@ -499,6 +502,7 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void) static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) { struct sk_buff *skb; + struct nlmsgerr *e; /* * Play the pending entries through our router @@ -515,7 +519,9 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); - ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE; + e = NLMSG_DATA(nlh); + e->error = -EMSGSIZE; + memset(&e->msg, 0, sizeof(e->msg)); } err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); } else diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a54d4ef3fd35..8140bed78a26 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2923,6 +2923,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags); r = NLMSG_DATA(nlh); r->ifi_family = AF_INET6; + r->__ifi_pad = 0; r->ifi_type = dev->type; r->ifi_index = dev->ifindex; r->ifi_flags = dev_get_flags(dev); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9594206e6035..249c61936ea0 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -439,6 +439,8 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, t = NLMSG_DATA(nlh); t->tca_family = AF_UNSPEC; + t->tca__pad1 = 0; + t->tca__pad2 = 0; x = (struct rtattr*) skb->tail; RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); @@ -580,6 +582,8 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid) nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t)); t = NLMSG_DATA(nlh); t->tca_family = AF_UNSPEC; + t->tca__pad1 = 0; + t->tca__pad2 = 0; x = (struct rtattr *) skb->tail; RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); @@ -687,7 +691,9 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event, nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); t = NLMSG_DATA(nlh); t->tca_family = AF_UNSPEC; - + t->tca__pad1 = 0; + t->tca__pad2 = 0; + x = (struct rtattr*) skb->tail; RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); @@ -842,6 +848,8 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) cb->nlh->nlmsg_type, sizeof(*t)); t = NLMSG_DATA(nlh); t->tca_family = AF_UNSPEC; + t->tca__pad1 = 0; + t->tca__pad2 = 0; x = (struct rtattr *) skb->tail; RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 1616bf5c9627..3b5714ef4d1a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -331,6 +331,8 @@ tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh, nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); tcm = NLMSG_DATA(nlh); tcm->tcm_family = AF_UNSPEC; + tcm->tcm__pad1 = 0; + tcm->tcm__pad1 = 0; tcm->tcm_ifindex = tp->q->dev->ifindex; tcm->tcm_parent = tp->classid; tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 97c1c75d5c78..05e6e0a799da 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -770,6 +770,8 @@ static int 
tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); tcm = NLMSG_DATA(nlh); tcm->tcm_family = AF_UNSPEC; + tcm->tcm__pad1 = 0; + tcm->tcm__pad2 = 0; tcm->tcm_ifindex = q->dev->ifindex; tcm->tcm_parent = clid; tcm->tcm_handle = q->handle; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index d43e3b8cbf6a..baeb3111f75e 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1563,6 +1563,8 @@ static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) if (cl->police) { opt.police = cl->police; + opt.__res1 = 0; + opt.__res2 = 0; RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); } return skb->len; -- cgit From 8a47077a0b5aa2649751c46e7a27884e6686ccbf Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 28 Jun 2005 12:56:45 -0700 Subject: [NETLINK]: Missing padding fields in dumped structures Plug holes with padding fields and initialized them to zero. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 3 +++ net/sched/cls_rsvp.h | 1 + net/sched/sch_cbq.c | 1 + 3 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8140bed78a26..1b2902d8eb98 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3031,9 +3031,12 @@ static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*pmsg), flags); pmsg = NLMSG_DATA(nlh); pmsg->prefix_family = AF_INET6; + pmsg->prefix_pad1 = 0; + pmsg->prefix_pad2 = 0; pmsg->prefix_ifindex = idev->dev->ifindex; pmsg->prefix_len = pinfo->prefix_len; pmsg->prefix_type = pinfo->type; + pmsg->prefix_pad3 = 0; pmsg->prefix_flags = 0; if (pinfo->onlink) diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 232fb9196810..006168d69376 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -618,6 +618,7 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, pinfo.protocol = s->protocol; pinfo.tunnelid = s->tunnelid; pinfo.tunnelhdr = f->tunnelhdr; + pinfo.pad = 0; RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); if (f->res.classid) RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid); diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index baeb3111f75e..09453f997d8c 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1528,6 +1528,7 @@ static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) opt.strategy = cl->ovl_strategy; opt.priority2 = cl->priority2+1; + opt.pad = 0; opt.penalty = (cl->penalty*1000)/HZ; RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); return skb->len; -- cgit From ae9cda5d65f3d8a495241cbdcc2d56f721c83cc3 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Tue, 28 Jun 2005 13:00:30 -0700 Subject: [IPV6]: Don't dump temporary addresses twice Each IPv6 Temporary Address (w/ CONFIG_IPV6_PRIVACY) is dumped twice to netlink. Because temporary addresses are listed in idev->addr_list, there's no need to dump idev->tempaddr separately. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 1b2902d8eb98..77004b9456c0 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2777,7 +2777,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, read_lock_bh(&idev->lock); switch (type) { case UNICAST_ADDR: - /* unicast address */ + /* unicast address incl. temp addr */ for (ifa = idev->addr_list; ifa; ifa = ifa->if_next, ip_idx++) { if (ip_idx < s_ip_idx) @@ -2788,19 +2788,6 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, NLM_F_MULTI)) <= 0) goto done; } - /* temp addr */ -#ifdef CONFIG_IPV6_PRIVACY - for (ifa = idev->tempaddr_list; ifa; - ifa = ifa->tmp_next, ip_idx++) { - if (ip_idx < s_ip_idx) - continue; - if ((err = inet6_fill_ifaddr(skb, ifa, - NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, RTM_NEWADDR, - NLM_F_MULTI)) <= 0) - goto done; - } -#endif break; case MULTICAST_ADDR: /* multicast address */ -- cgit From 2c2910a401f1ce2ac9136171e7522e731e1a2a8c Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 28 Jun 2005 13:06:23 -0700 Subject: [IPV4]: Snmpv2 Mib IP counter ipInAddrErrors support I followed Thomas' proposal to see every martian destination as a case where the ipInAddrErrors counter has to be incremented. There are two advantages by doing so: (1) The relation between the ipInReceive counter and all the other ipInXXX counters is more accurate in the case the RTN_UNICAST code check fails and (2) it makes the code in ip_route_input_slow easier. Signed-off-by: Dietmar Eggemann Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 6 +++++- net/ipv4/route.c | 9 +++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index af2ec88bbb2f..c703528e0bcd 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -283,14 +283,18 @@ static inline int ip_rcv_finish(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct iphdr *iph = skb->nh.iph; + int err; /* * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. 
 */ if (skb->dst == NULL) { if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { if (err == -EHOSTUNREACH) IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); goto drop; } } #ifdef CONFIG_NET_CLS_ROUTE diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 80cf633d9f4a..12a1cf306f67 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1909,7 +1909,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, */ if ((err = fib_lookup(&fl, &res)) != 0) { if (!IN_DEV_FORWARD(in_dev)) - goto e_inval; + goto e_hostunreach; goto no_route; } free_res = 1; @@ -1933,7 +1933,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, } if (!IN_DEV_FORWARD(in_dev)) - goto e_inval; + goto e_hostunreach; if (res.type != RTN_UNICAST) goto martian_destination; @@ -2025,6 +2025,11 @@ martian_destination: "%u.%u.%u.%u, dev %s\n", NIPQUAD(daddr), NIPQUAD(saddr), dev->name); #endif + +e_hostunreach: + err = -EHOSTUNREACH; + goto done; + e_inval: err = -EINVAL; goto done; -- cgit From 7a1af5d7bb94af16b980a53330436b9fadc12435 Mon Sep 17 00:00:00 2001 From: Maxime Bizon Date: Tue, 28 Jun 2005 13:21:12 -0700 Subject: [IPV4]: ipconfig.c: fix dhcp timeout behaviour I think there is a small bug in ipconfig.c in case IPCONFIG_DHCP is set and dhcp is used. When a DHCPOFFER is received, ip address is kept until we get DHCPACK. If no ack is received, ic_dynamic() returns negatively, but leaves the offered ip address in ic_myaddr. This makes the main loop in ip_auto_config() break and use the possibly incomplete configuration. Not sure if it's the best way to do it, but the following trivial patch corrects this. Signed-off-by: Maxime Bizon Signed-off-by: David S. Miller --- net/ipv4/ipconfig.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index f2509034ce72..d2bf8e1930a3 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1149,8 +1149,10 @@ static int __init ic_dynamic(void) ic_rarp_cleanup(); #endif - if (!ic_got_reply) + if (!ic_got_reply) { + ic_myaddr = INADDR_NONE; return -1; + } printk("IP-Config: Got %s answer from %u.%u.%u.%u, ", ((ic_got_reply & IC_RARP) ? "RARP" -- cgit From 2f85a42964dd43fed3a339701db046bee5a8b903 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 28 Jun 2005 13:24:23 -0700 Subject: [SCTP] Make init & delayed sack timeouts configurable by user. Signed-off-by: Vlad Yasevich Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/endpointola.c | 13 +++++-------- net/sctp/protocol.c | 5 ++++- net/sctp/sysctl.c | 13 +++++++++++++ net/sctp/transport.c | 1 - 4 files changed, 22 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 2ec0320fac3b..c44bf4165c6e 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -102,9 +102,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, /* Set up the base timeout information.
*/ ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = - SCTP_DEFAULT_TIMEOUT_T1_COOKIE; + msecs_to_jiffies(sp->rtoinfo.srto_initial); ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = - SCTP_DEFAULT_TIMEOUT_T1_INIT; + msecs_to_jiffies(sp->rtoinfo.srto_initial); ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = msecs_to_jiffies(sp->rtoinfo.srto_initial); ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; @@ -117,12 +117,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max); - ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = - SCTP_DEFAULT_TIMEOUT_HEARTBEAT; - ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = - SCTP_DEFAULT_TIMEOUT_SACK; - ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = - sp->autoclose * HZ; + ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; + ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = sctp_sack_timeout; + ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; /* Use SCTP specific send buffer space queues. */ ep->sndbuf_policy = sctp_sndbuf_policy; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 5135e1a25d25..e7f37faba7c0 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1050,7 +1050,10 @@ SCTP_STATIC __init int sctp_init(void) sctp_sndbuf_policy = 0; /* HB.interval - 30 seconds */ - sctp_hb_interval = 30 * HZ; + sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT; + + /* delayed SACK timeout */ + sctp_sack_timeout = SCTP_DEFAULT_TIMEOUT_SACK; /* Implementation specific variables. */ diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 7fc31849312b..dc4893474f18 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -47,6 +47,8 @@ static ctl_handler sctp_sysctl_jiffies_ms; static long rto_timer_min = 1; static long rto_timer_max = 86400000; /* One day */ +static long sack_timer_min = 1; +static long sack_timer_max = 500; static ctl_table sctp_table[] = { { @@ -187,6 +189,17 @@ static ctl_table sctp_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, + { + .ctl_name = NET_SCTP_SACK_TIMEOUT, + .procname = "sack_timeout", + .data = &sctp_sack_timeout, + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = &proc_doulongvec_ms_jiffies_minmax, + .strategy = &sctp_sysctl_jiffies_ms, + .extra1 = &sack_timer_min, + .extra2 = &sack_timer_max, + }, { .ctl_name = 0 } }; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 0ec0fde6e6c5..a63b69179607 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -103,7 +103,6 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, /* Set up the heartbeat timer. */ init_timer(&peer->hb_timer); - peer->hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT; peer->hb_timer.function = sctp_generate_heartbeat_event; peer->hb_timer.data = (unsigned long)peer; -- cgit From f835e471b557c45d2e5701ea5215f6e739b4eb39 Mon Sep 17 00:00:00 2001 From: Robert Olsson Date: Tue, 28 Jun 2005 15:00:39 -0700 Subject: [IPV4]: Broken memory allocation in fib_trie This should help up the insertion... but the resize is more crucial. and complex and needs some thinking. Signed-off-by: Robert Olsson Signed-off-by: David S. 
Miller --- net/ipv4/fib_trie.c | 56 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 0671569ee6f0..b56e88edf1b3 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -43,7 +43,7 @@ * 2 of the License, or (at your option) any later version. */ -#define VERSION "0.323" +#define VERSION "0.324" #include #include @@ -341,8 +341,10 @@ static struct leaf *leaf_new(void) static struct leaf_info *leaf_info_new(int plen) { struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); - li->plen = plen; - INIT_LIST_HEAD(&li->falh); + if(li) { + li->plen = plen; + INIT_LIST_HEAD(&li->falh); + } return li; } @@ -879,8 +881,8 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) return (struct node*) tn; } -static struct list_head * -fib_insert_node(struct trie *t, u32 key, int plen) +static struct list_head * +fib_insert_node(struct trie *t, int *err, u32 key, int plen) { int pos, newpos; struct tnode *tp = NULL, *tn = NULL; @@ -940,7 +942,6 @@ fib_insert_node(struct trie *t, u32 key, int plen) if(tp && IS_LEAF(tp)) BUG(); - t->revision++; /* Case 1: n is a leaf. Compare prefixes */ @@ -949,8 +950,10 @@ fib_insert_node(struct trie *t, u32 key, int plen) li = leaf_info_new(plen); - if(! li) - BUG(); + if(! li) { + *err = -ENOMEM; + goto err; + } fa_head = &li->falh; insert_leaf_info(&l->list, li); @@ -959,14 +962,19 @@ fib_insert_node(struct trie *t, u32 key, int plen) t->size++; l = leaf_new(); - if(! l) - BUG(); + if(! l) { + *err = -ENOMEM; + goto err; + } l->key = key; li = leaf_info_new(plen); - if(! li) - BUG(); + if(! li) { + tnode_free((struct tnode *) l); + *err = -ENOMEM; + goto err; + } fa_head = &li->falh; insert_leaf_info(&l->list, li); @@ -1003,9 +1011,14 @@ fib_insert_node(struct trie *t, u32 key, int plen) newpos = 0; tn = tnode_new(key, newpos, 1); /* First tnode */ } - if(!tn) - trie_bug("tnode_pfx_new failed"); + if(!tn) { + free_leaf_info(li); + tnode_free((struct tnode *) l); + *err = -ENOMEM; + goto err; + } + NODE_SET_PARENT(tn, tp); missbit=tkey_extract_bits(key, newpos, 1); @@ -1027,7 +1040,9 @@ fib_insert_node(struct trie *t, u32 key, int plen) } /* Rebalance the trie */ t->trie = trie_rebalance(t, tp); -done:; +done: + t->revision++; +err:; return fa_head; } @@ -1156,8 +1171,12 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, * Insert new entry to the list. */ - if(!fa_head) - fa_head = fib_insert_node(t, key, plen); + if(!fa_head) { + fa_head = fib_insert_node(t, &err, key, plen); + err = 0; + if(err) + goto out_free_new_fa; + } write_lock_bh(&fib_lock); @@ -1170,6 +1189,9 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); succeeded: return 0; + +out_free_new_fa: + kmem_cache_free(fn_alias_kmem, new_fa); out: fib_release_info(fi); err:; -- cgit From fb3d89498d268c8dedc1ab5b15fa64f536564577 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 28 Jun 2005 15:40:02 -0700 Subject: [IPVS]: Close race conditions on ip_vs_conn_tab list modification In an smp system, it is possible for a connection timer to expire, calling ip_vs_conn_expire while the connection table is being flushed, before ct_write_lock_bh is acquired.
Since the list iterator loop in ip_vs_conn_flush releases and re-acquires the spinlock (even though it doesn't re-enable softirqs), it is possible for the expiration function to modify the connection list, while it is being traversed in ip_vs_conn_flush. The result is that the next pointer gets set to NULL, and subsequently dereferenced, resulting in an oops. Signed-off-by: Neil Horman Acked-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/ipvs/ip_vs_conn.c | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index fd6feb5499fe..9f16ab309106 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -548,7 +548,6 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) { if (del_timer(&cp->timer)) mod_timer(&cp->timer, jiffies); - __ip_vs_conn_put(cp); } @@ -764,7 +763,6 @@ void ip_vs_random_dropentry(void) { int idx; struct ip_vs_conn *cp; - struct ip_vs_conn *ct; /* * Randomly scan 1/32 of the whole table every second @@ -801,21 +799,12 @@ void ip_vs_random_dropentry(void) continue; } - /* - * Drop the entry, and drop its ct if not referenced - */ - atomic_inc(&cp->refcnt); - ct_write_unlock(hash); - - if ((ct = cp->control)) - atomic_inc(&ct->refcnt); IP_VS_DBG(4, "del connection\n"); ip_vs_conn_expire_now(cp); - if (ct) { + if (cp->control) { IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(ct); + ip_vs_conn_expire_now(cp->control); } - ct_write_lock(hash); } ct_write_unlock(hash); } @@ -829,7 +818,6 @@ static void ip_vs_conn_flush(void) { int idx; struct ip_vs_conn *cp; - struct ip_vs_conn *ct; flush_again: for (idx=0; idxrefcnt); ct_write_unlock(idx); - if ((ct = cp->control)) - atomic_inc(&ct->refcnt); IP_VS_DBG(4, "del connection\n"); ip_vs_conn_expire_now(cp); - if (ct) { + if (cp->control) { IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(ct); + ip_vs_conn_expire_now(cp->control); } - ct_write_lock(idx); } ct_write_unlock_bh(idx); } -- cgit From 7fe40f73d7591b38f129fe6a9c0fa46e0b192d09 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Tue, 28 Jun 2005 15:46:24 -0700 Subject: [IPV6]: remove more unused IPV6_AUTHHDR things. Remove two more unused IPV6_AUTHHDR option things, which I failed to remove last time, and mark IPV6_AUTHHDR obsolete. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/ip6_flowlabel.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 0e5f7499debb..b6c73da5ff35 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -244,7 +244,6 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space, opt_space->opt_nflen = 0; } opt_space->dst1opt = fopt->dst1opt; - opt_space->auth = fopt->auth; opt_space->opt_flen = fopt->opt_flen; return opt_space; } -- cgit From ff593c592a5d674822dce31143635b025f6415b2 Mon Sep 17 00:00:00 2001 From: Denis Vlasenko Date: Tue, 28 Jun 2005 15:49:06 -0700 Subject: [NET]: Micro optimization in eth_header() Signed-off-by: Denis Vlasenko Signed-off-by: David S.
Miller --- net/ethernet/eth.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 6617ea47d365..ab60ea63688e 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -92,10 +92,9 @@ int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, * Set the source hardware address. */ - if(saddr) - memcpy(eth->h_source,saddr,dev->addr_len); - else - memcpy(eth->h_source,dev->dev_addr,dev->addr_len); + if(!saddr) + saddr = dev->dev_addr; + memcpy(eth->h_source,saddr,dev->addr_len); /* * Anyway, the loopback-device should never use this function... -- cgit From 9666dae51013b064e7d77fc36b5cee98dd167ed5 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 28 Jun 2005 16:04:44 -0700 Subject: [NETFILTER]: Fix connection tracking bug in 2.6.12 In 2.6.12 we started dropping the conntrack reference when a packet leaves the IP layer. This broke connection tracking on a bridge, because bridge-netfilter defers calling some NF_IP_* hooks to the bridge layer for locally generated packets going out a bridge, where the conntrack reference is no longer available. This patch keeps the reference in this case as a temporary solution, long term we will remove the defered hook calling. No attempt is made to drop the reference in the bridge-code when it is no longer needed, tc actions could already have sent the packet anywhere. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 2 +- net/ipv4/ip_output.c | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 03ae4edddac3..2d52fee63a8c 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -844,7 +844,7 @@ static unsigned int ip_sabotage_out(unsigned int hook, struct sk_buff **pskb, * doesn't use the bridge parent of the indev by using * the BRNF_DONT_TAKE_PARENT mask. */ if (hook == NF_IP_FORWARD && nf_bridge->physindev == NULL) { - nf_bridge->mask &= BRNF_DONT_TAKE_PARENT; + nf_bridge->mask |= BRNF_DONT_TAKE_PARENT; nf_bridge->physindev = (struct net_device *)in; } #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ee07aec215a0..6ce5c3292f9f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -188,7 +188,13 @@ static inline int ip_finish_output2(struct sk_buff *skb) skb = skb2; } - nf_reset(skb); +#ifdef CONFIG_BRIDGE_NETFILTER + /* bridge-netfilter defers calling some IP hooks to the bridge layer + * and still needs the conntrack reference. + */ + if (skb->nf_bridge == NULL) +#endif + nf_reset(skb); if (hh) { int hh_alen; -- cgit From 6935d46c2da64aa032a557374c95336e265cd7ef Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 5 Jul 2005 14:08:57 -0700 Subject: [NET]: Remove redundant code in net/core/filter.c skb_header_pointer handles linear and non-linear data, no need to handle linear data again. Signed-off-by: Patrick McHardy Acked-by: Herbert Xu Signed-off-by: David S. 
Miller --- net/core/filter.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index f3b88205ace2..e1267b465def 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -168,10 +168,6 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) case BPF_LD|BPF_W|BPF_ABS: k = fentry->k; load_w: - if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) { - A = ntohl(*(u32*)&data[k]); - continue; - } if (k < 0) { u8 *ptr; @@ -194,10 +190,6 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) case BPF_LD|BPF_H|BPF_ABS: k = fentry->k; load_h: - if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) { - A = ntohs(*(u16*)&data[k]); - continue; - } if (k < 0) { u8 *ptr; @@ -220,10 +212,6 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) case BPF_LD|BPF_B|BPF_ABS: k = fentry->k; load_b: - if (k >= 0 && (unsigned int)k < len) { - A = data[k]; - continue; - } if (k < 0) { u8 *ptr; -- cgit From 0b05b2a49e430220876f8faa7e4778dc7497033c Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 5 Jul 2005 14:10:21 -0700 Subject: [NET]: Consolidate common code in net/core/filter.c Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/filter.c | 90 ++++++++++++++++++++----------------------------------- 1 file changed, 33 insertions(+), 57 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index e1267b465def..3923428a840f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -36,7 +36,7 @@ #include /* No hurry in this branch */ -static u8 *load_pointer(struct sk_buff *skb, int k) +static void *__load_pointer(struct sk_buff *skb, int k) { u8 *ptr = NULL; @@ -50,6 +50,18 @@ static u8 *load_pointer(struct sk_buff *skb, int k) return NULL; } +static inline void *load_pointer(struct sk_buff *skb, int k, + unsigned int size, void *buffer) +{ + if (k >= 0) + return skb_header_pointer(skb, k, size, buffer); + else { + if (k >= SKF_AD_OFF) + return NULL; + return __load_pointer(skb, k); + } +} + /** * sk_run_filter - run a filter on a socket * @skb: buffer to run the filter on @@ -64,15 +76,16 @@ static u8 *load_pointer(struct sk_buff *skb, int k) int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) { - unsigned char *data = skb->data; /* len is UNSIGNED. Byte wide insns relies only on implicit type casts to prevent reading arbitrary memory locations. 
*/ unsigned int len = skb->len-skb->data_len; struct sock_filter *fentry; /* We walk down these */ + void *ptr; u32 A = 0; /* Accumulator */ u32 X = 0; /* Index Register */ u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ + u32 tmp; int k; int pc; @@ -168,67 +181,28 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) case BPF_LD|BPF_W|BPF_ABS: k = fentry->k; load_w: - if (k < 0) { - u8 *ptr; - - if (k >= SKF_AD_OFF) - break; - ptr = load_pointer(skb, k); - if (ptr) { - A = ntohl(*(u32*)ptr); - continue; - } - } else { - u32 _tmp, *p; - p = skb_header_pointer(skb, k, 4, &_tmp); - if (p != NULL) { - A = ntohl(*p); - continue; - } + ptr = load_pointer(skb, k, 4, &tmp); + if (ptr != NULL) { + A = ntohl(*(u32 *)ptr); + continue; } return 0; case BPF_LD|BPF_H|BPF_ABS: k = fentry->k; load_h: - if (k < 0) { - u8 *ptr; - - if (k >= SKF_AD_OFF) - break; - ptr = load_pointer(skb, k); - if (ptr) { - A = ntohs(*(u16*)ptr); - continue; - } - } else { - u16 _tmp, *p; - p = skb_header_pointer(skb, k, 2, &_tmp); - if (p != NULL) { - A = ntohs(*p); - continue; - } + ptr = load_pointer(skb, k, 2, &tmp); + if (ptr != NULL) { + A = ntohs(*(u16 *)ptr); + continue; } return 0; case BPF_LD|BPF_B|BPF_ABS: k = fentry->k; load_b: - if (k < 0) { - u8 *ptr; - - if (k >= SKF_AD_OFF) - break; - ptr = load_pointer(skb, k); - if (ptr) { - A = *ptr; - continue; - } - } else { - u8 _tmp, *p; - p = skb_header_pointer(skb, k, 1, &_tmp); - if (p != NULL) { - A = *p; - continue; - } + ptr = load_pointer(skb, k, 1, &tmp); + if (ptr != NULL) { + A = *(u8 *)ptr; + continue; } return 0; case BPF_LD|BPF_W|BPF_LEN: @@ -247,10 +221,12 @@ load_b: k = X + fentry->k; goto load_b; case BPF_LDX|BPF_B|BPF_MSH: - if (fentry->k >= len) - return 0; - X = (data[fentry->k] & 0xf) << 2; - continue; + ptr = load_pointer(skb, fentry->k, 1, &tmp); + if (ptr != NULL) { + X = (*(u8 *)ptr & 0xf) << 2; + continue; + } + return 0; case BPF_LD|BPF_IMM: A = fentry->k; continue; -- cgit From 3154e540e374bbfd62693d95bc8ed51da95efe75 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 5 Jul 2005 14:10:40 -0700 Subject: [NET]: net/core/filter.c: make len cover the entire packet As suggested by Herbert Xu: Since we don't require anything to be in the linear packet range anymore make len cover the entire packet. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/filter.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 3923428a840f..cd91a24f9720 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -76,10 +76,6 @@ static inline void *load_pointer(struct sk_buff *skb, int k, int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) { - /* len is UNSIGNED. Byte wide insns relies only on implicit - type casts to prevent reading arbitrary memory locations. - */ - unsigned int len = skb->len-skb->data_len; struct sock_filter *fentry; /* We walk down these */ void *ptr; u32 A = 0; /* Accumulator */ @@ -206,10 +202,10 @@ load_b: } return 0; case BPF_LD|BPF_W|BPF_LEN: - A = len; + A = skb->len; continue; case BPF_LDX|BPF_W|BPF_LEN: - X = len; + X = skb->len; continue; case BPF_LD|BPF_W|BPF_IND: k = X + fentry->k; -- cgit From e176fe8954a5239c24afe79b1001ba3c29511963 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 5 Jul 2005 14:12:44 -0700 Subject: [NET]: Remove unused security member in sk_buff Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 2 -- net/ipv4/ip_output.c | 1 - net/ipv6/ip6_output.c | 1 - net/sched/em_meta.c | 6 ------ 4 files changed, 10 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bb73b2190ec7..733deee24b9f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -357,7 +357,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) C(ip_summed); C(priority); C(protocol); - C(security); n->destructor = NULL; #ifdef CONFIG_NETFILTER C(nfmark); @@ -422,7 +421,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->pkt_type = old->pkt_type; new->stamp = old->stamp; new->destructor = NULL; - new->security = old->security; #ifdef CONFIG_NETFILTER new->nfmark = old->nfmark; new->nfcache = old->nfcache; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6ce5c3292f9f..1bfa49eda96f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -389,7 +389,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - to->security = from->security; dst_release(to->dst); to->dst = dst_clone(from->dst); to->dev = from->dev; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 06e7cdaeedc5..1f2c2f9e353f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -465,7 +465,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - to->security = from->security; dst_release(to->dst); to->dst = dst_clone(from->dst); to->dev = from->dev; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 48bb23c2a35a..53d98f8d3d80 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -205,11 +205,6 @@ META_COLLECTOR(int_protocol) dst->value = skb->protocol; } -META_COLLECTOR(int_security) -{ - dst->value = skb->security; -} - META_COLLECTOR(int_pkttype) { dst->value = skb->pkt_type; @@ -524,7 +519,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(REALDEV)] = META_FUNC(int_realdev), [META_ID(PRIORITY)] = META_FUNC(int_priority), [META_ID(PROTOCOL)] = META_FUNC(int_protocol), - [META_ID(SECURITY)] = META_FUNC(int_security), [META_ID(PKTTYPE)] = META_FUNC(int_pkttype), [META_ID(PKTLEN)] = META_FUNC(int_pktlen), [META_ID(DATALEN)] = META_FUNC(int_datalen), -- cgit From 3d54b82fdf0ca79608f61448fb8ab92676487645 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 5 Jul 2005 14:15:09 -0700 Subject: [PKT_SCHED]: Cleanup qdisc creation and alignment macros Adds qdisc_alloc() to share code between qdisc_create() and qdisc_create_dflt(). Hides the qdisc alignment behind macros and makes use of them. Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller --- net/sched/sch_api.c | 41 +++++++++-------------------------------- net/sched/sch_generic.c | 35 ++++++++++++++++++++++++----------- 2 files changed, 33 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 05e6e0a799da..1ef482ba6b36 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -399,10 +399,8 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) { int err; struct rtattr *kind = tca[TCA_KIND-1]; - void *p = NULL; struct Qdisc *sch; struct Qdisc_ops *ops; - int size; ops = qdisc_lookup_ops(kind); #ifdef CONFIG_KMOD @@ -437,43 +435,23 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) if (ops == NULL) goto err_out; - /* ensure that the Qdisc and the private data are 32-byte aligned */ - size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); - size += ops->priv_size + QDISC_ALIGN_CONST; - - p = kmalloc(size, GFP_KERNEL); - err = -ENOBUFS; - if (!p) + sch = qdisc_alloc(dev, ops); + if (IS_ERR(sch)) { + err = PTR_ERR(sch); goto err_out2; - memset(p, 0, size); - sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) - & ~QDISC_ALIGN_CONST); - sch->padded = (char *)sch - (char *)p; - - INIT_LIST_HEAD(&sch->list); - skb_queue_head_init(&sch->q); + } - if (handle == TC_H_INGRESS) + if (handle == TC_H_INGRESS) { sch->flags |= TCQ_F_INGRESS; - - sch->ops = ops; - sch->enqueue = ops->enqueue; - sch->dequeue = ops->dequeue; - sch->dev = dev; - dev_hold(dev); - atomic_set(&sch->refcnt, 1); - sch->stats_lock = &dev->queue_lock; - if (handle == 0) { + handle = TC_H_MAKE(TC_H_INGRESS, 0); + } else if (handle == 0) { handle = qdisc_alloc_handle(dev); err = -ENOMEM; if (handle == 0) goto err_out3; } - if (handle == TC_H_INGRESS) - sch->handle =TC_H_MAKE(TC_H_INGRESS, 0); - else - sch->handle = handle; + sch->handle = handle; if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { qdisc_lock_tree(dev); @@ -489,12 +467,11 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) } err_out3: dev_put(dev); + kfree((char *) sch - sch->padded); err_out2: module_put(ops->owner); err_out: *errp = err; - if (p) - kfree(p); return NULL; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7683b34dc6a9..73e218e646ac 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -395,24 +395,23 @@ static struct Qdisc_ops pfifo_fast_ops = { .owner = THIS_MODULE, }; -struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) +struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops) { void *p; struct Qdisc *sch; - int size; + unsigned int size; + int err = -ENOBUFS; /* ensure that the Qdisc and the private data are 32-byte aligned */ - size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); - size += ops->priv_size + QDISC_ALIGN_CONST; + size = QDISC_ALIGN(sizeof(*sch)); + size += ops->priv_size + (QDISC_ALIGNTO - 1); p = kmalloc(size, GFP_KERNEL); if (!p) - return NULL; + goto errout; memset(p, 0, size); - - sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) - & ~QDISC_ALIGN_CONST); - sch->padded = (char *)sch - (char *)p; + sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); + sch->padded = (char *) sch - (char *) p; INIT_LIST_HEAD(&sch->list); skb_queue_head_init(&sch->q); @@ -423,11 +422,24 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) dev_hold(dev); sch->stats_lock = &dev->queue_lock; 
atomic_set(&sch->refcnt, 1); + + return sch; +errout: + return ERR_PTR(-err); +} + +struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) +{ + struct Qdisc *sch; + + sch = qdisc_alloc(dev, ops); + if (IS_ERR(sch)) + goto errout; + if (!ops->init || ops->init(sch, NULL) == 0) return sch; - dev_put(dev); - kfree(p); +errout: return NULL; } @@ -591,6 +603,7 @@ EXPORT_SYMBOL(__netdev_watchdog_up); EXPORT_SYMBOL(noop_qdisc); EXPORT_SYMBOL(noop_qdisc_ops); EXPORT_SYMBOL(qdisc_create_dflt); +EXPORT_SYMBOL(qdisc_alloc); EXPORT_SYMBOL(qdisc_destroy); EXPORT_SYMBOL(qdisc_reset); EXPORT_SYMBOL(qdisc_restart); -- cgit From 023e09a767a89bf1b8646307410852d93fd72f00 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 5 Jul 2005 14:15:53 -0700 Subject: [PKT_SCHED]: Report rate estimator configuration errors during qdisc allocation Current behaviour is to not report an error if a rate estimator is created together with a qdisc and the configuration of the rate estimator is bogus. This leads to unexpected behaviour because the user is not notified. New behaviour is to report the error and let the whole qdisc creation operation fail so the user is able to fix his mistake. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/sched/sch_api.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 1ef482ba6b36..b9a069af4a02 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -454,15 +454,27 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) sch->handle = handle; if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) { + err = gen_new_estimator(&sch->bstats, &sch->rate_est, + sch->stats_lock, + tca[TCA_RATE-1]); + if (err) { + /* + * Any broken qdiscs that would require + * a ops->reset() here? The qdisc was never + * in action so it shouldn't be necessary. + */ + if (ops->destroy) + ops->destroy(sch); + goto err_out3; + } + } +#endif qdisc_lock_tree(dev); list_add_tail(&sch->list, &dev->qdisc_list); qdisc_unlock_tree(dev); -#ifdef CONFIG_NET_ESTIMATOR - if (tca[TCA_RATE-1]) - gen_new_estimator(&sch->bstats, &sch->rate_est, - sch->stats_lock, tca[TCA_RATE-1]); -#endif return sch; } err_out3: -- cgit From 30e224d76f34e041c30df66a4dcbeeb53556ea3f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 5 Jul 2005 14:40:10 -0700 Subject: [IPV4]: Fix crash in ip_rcv while booting related to netconsole Makes IPv4 ip_rcv registration happen last in af_inet. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- net/ipv4/af_inet.c | 11 +++++++++++ net/ipv4/ip_output.c | 15 --------------- 2 files changed, 11 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 658e7977924d..ef7468376ae6 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1009,6 +1009,15 @@ static int __init init_ipv4_mibs(void) static int ipv4_proc_init(void); extern void ipfrag_init(void); +/* + * IP protocol layer initialiser + */ + +static struct packet_type ip_packet_type = { + .type = __constant_htons(ETH_P_IP), + .func = ip_rcv, +}; + static int __init inet_init(void) { struct sk_buff *dummy_skb; @@ -1102,6 +1111,8 @@ static int __init inet_init(void) ipfrag_init(); + dev_add_pack(&ip_packet_type); + rc = 0; out: return rc; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 1bfa49eda96f..9de83e6e0f1d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1328,23 +1328,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar ip_rt_put(rt); } -/* - * IP protocol layer initialiser - */ - -static struct packet_type ip_packet_type = { - .type = __constant_htons(ETH_P_IP), - .func = ip_rcv, -}; - -/* - * IP registers the packet type and then calls the subprotocol initialisers - */ - void __init ip_init(void) { - dev_add_pack(&ip_packet_type); - ip_rt_init(); inet_initpeers(); -- cgit From e2ed4052aa662e7cfb22a1793b9d8158603be6d7 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 5 Jul 2005 14:41:20 -0700 Subject: [IPV6]: Makes IPv6 rcv registration happen last during initialisation. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2b193e3df49a..28d9bcab0970 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -774,7 +774,6 @@ static int __init inet6_init(void) if (if6_proc_init()) goto proc_if6_fail; #endif - ipv6_packet_init(); ip6_route_init(); ip6_flowlabel_init(); err = addrconf_init(); @@ -791,6 +790,8 @@ static int __init inet6_init(void) /* Init v6 transport protocols. */ udpv6_init(); tcpv6_init(); + + ipv6_packet_init(); err = 0; out: return err; @@ -798,7 +799,6 @@ out: addrconf_fail: ip6_flowlabel_cleanup(); ip6_route_cleanup(); - ipv6_packet_cleanup(); #ifdef CONFIG_PROC_FS if6_proc_exit(); proc_if6_fail: -- cgit From f0e36f8cee8101604378085171c980d9cc71d779 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 5 Jul 2005 14:44:55 -0700 Subject: [IPV4]: Handle large allocations in fib_trie Inflating a node a couple of times makes it exceed the 128k kmalloc limit. Use __get_free_pages for allocations > PAGE_SIZE, as in fib_hash. Signed-off-by: Patrick McHardy Acked-by: Robert Olsson Signed-off-by: David S. 
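To make the constraint concrete before the diff: the changelog's 128k figure is the largest slab size kmalloc() can serve in this era, while __get_free_pages() can return any 2^order run of pages. A hedged sketch of the resulting pattern; the essential point is that the allocate and free paths must compute the size, and therefore the order, identically:

	static void *node_alloc(size_t size)
	{
		if (size <= PAGE_SIZE)
			return kmalloc(size, GFP_KERNEL);
		return (void *)__get_free_pages(GFP_KERNEL, get_order(size));
	}

	static void node_free(void *p, size_t size)
	{
		if (size <= PAGE_SIZE)
			kfree(p);
		else
			free_pages((unsigned long)p, get_order(size));
	}

For scale (illustrative numbers): a trie node with 2^15 children needs sizeof(struct tnode) + 32768 * sizeof(struct node *), roughly 256 KiB on a 64-bit machine, well past the kmalloc limit but a straightforward high-order page allocation.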
Miller --- net/ipv4/fib_trie.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index b56e88edf1b3..9038b914b4b1 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -358,11 +358,32 @@ static inline void free_leaf_info(struct leaf_info *li) kfree(li); } +static struct tnode *tnode_alloc(unsigned int size) +{ + if (size <= PAGE_SIZE) { + return kmalloc(size, GFP_KERNEL); + } else { + return (struct tnode *) + __get_free_pages(GFP_KERNEL, get_order(size)); + } +} + +static void __tnode_free(struct tnode *tn) +{ + unsigned int size = sizeof(struct tnode) + + (1<<tn->bits) * sizeof(struct node *); + + if (size <= PAGE_SIZE) + kfree(tn); + else + free_pages((unsigned long)tn, get_order(size)); +} + static struct tnode* tnode_new(t_key key, int pos, int bits) { int nchildren = 1< 0 ) printk("FT %p \n", tn); } -- cgit From 22c047ccbc68fa8f3fa57f0e8f906479a062c426 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Jul 2005 14:55:24 -0700 Subject: [NET]: Hashed spinlocks in net/ipv4/route.c - Locking abstraction - Spinlocks moved out of rt hash table : Less memory (50%) used by rt hash table. it's a win even on UP. - Sizing of spinlocks table depends on NR_CPUS Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 66 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 12a1cf306f67..daf82f8d3c4a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -54,6 +54,7 @@ * Marc Boucher : routing by fwmark * Robert Olsson : Added rt_cache statistics * Arnaldo C. Melo : Convert proc stuff to seq_file + * Eric Dumazet : hashed spinlocks * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -201,8 +202,37 @@ __u8 ip_tos2prio[16] = { struct rt_hash_bucket { struct rtable *chain; - spinlock_t lock; -} __attribute__((__aligned__(8))); +}; +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +/* + * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks + * The size of this table is a power of two and depends on the number of CPUS.
+ */ +#if NR_CPUS >= 32 +#define RT_HASH_LOCK_SZ 4096 +#elif NR_CPUS >= 16 +#define RT_HASH_LOCK_SZ 2048 +#elif NR_CPUS >= 8 +#define RT_HASH_LOCK_SZ 1024 +#elif NR_CPUS >= 4 +#define RT_HASH_LOCK_SZ 512 +#else +#define RT_HASH_LOCK_SZ 256 +#endif + +static spinlock_t *rt_hash_locks; +# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] +# define rt_hash_lock_init() { \ + int i; \ + rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \ + if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \ + for (i = 0; i < RT_HASH_LOCK_SZ; i++) \ + spin_lock_init(&rt_hash_locks[i]); \ + } +#else +# define rt_hash_lock_addr(slot) NULL +# define rt_hash_lock_init() +#endif static struct rt_hash_bucket *rt_hash_table; static unsigned rt_hash_mask; @@ -587,7 +617,7 @@ static void rt_check_expire(unsigned long dummy) i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; - spin_lock(&rt_hash_table[i].lock); + spin_lock(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { if (rth->u.dst.expires) { /* Entry is expired even if it is in use */ @@ -620,7 +650,7 @@ static void rt_check_expire(unsigned long dummy) rt_free(rth); #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ } - spin_unlock(&rt_hash_table[i].lock); + spin_unlock(rt_hash_lock_addr(i)); /* Fallback loop breaker. */ if (time_after(jiffies, now)) @@ -643,11 +673,11 @@ static void rt_run_flush(unsigned long dummy) get_random_bytes(&rt_hash_rnd, 4); for (i = rt_hash_mask; i >= 0; i--) { - spin_lock_bh(&rt_hash_table[i].lock); + spin_lock_bh(rt_hash_lock_addr(i)); rth = rt_hash_table[i].chain; if (rth) rt_hash_table[i].chain = NULL; - spin_unlock_bh(&rt_hash_table[i].lock); + spin_unlock_bh(rt_hash_lock_addr(i)); for (; rth; rth = next) { next = rth->u.rt_next; @@ -780,7 +810,7 @@ static int rt_garbage_collect(void) k = (k + 1) & rt_hash_mask; rthp = &rt_hash_table[k].chain; - spin_lock_bh(&rt_hash_table[k].lock); + spin_lock_bh(rt_hash_lock_addr(k)); while ((rth = *rthp) != NULL) { if (!rt_may_expire(rth, tmo, expire)) { tmo >>= 1; @@ -812,7 +842,7 @@ static int rt_garbage_collect(void) goal--; #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ } - spin_unlock_bh(&rt_hash_table[k].lock); + spin_unlock_bh(rt_hash_lock_addr(k)); if (goal <= 0) break; } @@ -882,7 +912,7 @@ restart: rthp = &rt_hash_table[hash].chain; - spin_lock_bh(&rt_hash_table[hash].lock); + spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED if (!(rth->u.dst.flags & DST_BALANCED) && @@ -908,7 +938,7 @@ restart: rth->u.dst.__use++; dst_hold(&rth->u.dst); rth->u.dst.lastuse = now; - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); *rp = rth; @@ -949,7 +979,7 @@ restart: if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { int err = arp_bind_neighbour(&rt->u.dst); if (err) { - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); if (err != -ENOBUFS) { rt_drop(rt); @@ -990,7 +1020,7 @@ restart: } #endif rt_hash_table[hash].chain = rt; - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); *rp = rt; return 0; } @@ -1058,7 +1088,7 @@ static void rt_del(unsigned hash, struct rtable *rt) { struct rtable **rthp; - spin_lock_bh(&rt_hash_table[hash].lock); + spin_lock_bh(rt_hash_lock_addr(hash)); ip_rt_put(rt); for (rthp = &rt_hash_table[hash].chain; *rthp; rthp = &(*rthp)->u.rt_next) @@ -1067,7 +1097,7 @@ static void rt_del(unsigned hash, struct rtable *rt) 
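/*
 * The pattern above is lock striping (a restatement, not patch text): any
 * number of hash buckets share a small, power-of-two array of spinlocks, and
 *
 *	rt_hash_lock_addr(slot) == &rt_hash_locks[slot & (RT_HASH_LOCK_SZ - 1)]
 *
 * picks a lock by masking the bucket index. Distinct buckets may map to the
 * same lock; that is safe (only occasional extra contention) and is where
 * the changelog's ~50% memory saving comes from: struct rt_hash_bucket
 * shrinks to a bare chain pointer, with at most 4096 locks in total no
 * matter how large the hash table grows.
 */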
rt_free(rt); break; } - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); } void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, @@ -3073,7 +3103,7 @@ __setup("rhash_entries=", set_rhash_entries); int __init ip_rt_init(void) { - int i, order, goal, rc = 0; + int order, goal, rc = 0; rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ (jiffies ^ (jiffies >> 7))); @@ -3122,10 +3152,8 @@ int __init ip_rt_init(void) /* NOTHING */; rt_hash_mask--; - for (i = 0; i <= rt_hash_mask; i++) { - spin_lock_init(&rt_hash_table[i].lock); - rt_hash_table[i].chain = NULL; - } + memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); + rt_hash_lock_init(); ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); ip_rt_max_size = (rt_hash_mask + 1) * 16; -- cgit From 424c4b70cc4ff3930ee36a2ef7b204e4d704fd26 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Jul 2005 14:58:19 -0700 Subject: [IPV4]: Use the fancy alloc_large_system_hash() function for route hash table - rt hash table allocated using alloc_large_system_hash() function. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 43 ++++++++++++++++--------------------------- 1 file changed, 16 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index daf82f8d3c4a..9fcbb1b0a8d6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include #include @@ -3103,12 +3104,14 @@ __setup("rhash_entries=", set_rhash_entries); int __init ip_rt_init(void) { - int order, goal, rc = 0; + int rc = 0; rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ (jiffies ^ (jiffies >> 7))); #ifdef CONFIG_NET_CLS_ROUTE + { + int order; for (order = 0; (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) /* NOTHING */; @@ -3116,6 +3119,7 @@ int __init ip_rt_init(void) if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); memset(ip_rt_acct, 0, PAGE_SIZE << order); + } #endif ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", @@ -3126,32 +3130,17 @@ int __init ip_rt_init(void) if (!ipv4_dst_ops.kmem_cachep) panic("IP: failed to allocate ip_dst_cache\n"); - goal = num_physpages >> (26 - PAGE_SHIFT); - if (rhash_entries) - goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; - for (order = 0; (1UL << order) < goal; order++) - /* NOTHING */; - - do { - rt_hash_mask = (1UL << order) * PAGE_SIZE / - sizeof(struct rt_hash_bucket); - while (rt_hash_mask & (rt_hash_mask - 1)) - rt_hash_mask--; - rt_hash_table = (struct rt_hash_bucket *) - __get_free_pages(GFP_ATOMIC, order); - } while (rt_hash_table == NULL && --order > 0); - - if (!rt_hash_table) - panic("Failed to allocate IP route cache hash table\n"); - - printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n", - rt_hash_mask, - (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024); - - for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++) - /* NOTHING */; - - rt_hash_mask--; + rt_hash_table = (struct rt_hash_bucket *) + alloc_large_system_hash("IP route cache", + sizeof(struct rt_hash_bucket), + rhash_entries, + (num_physpages >= 128 * 1024) ? 
+ (27 - PAGE_SHIFT) : + (29 - PAGE_SHIFT), + HASH_HIGHMEM, + &rt_hash_log, + &rt_hash_mask, + 0); memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); rt_hash_lock_init(); -- cgit From bb1d23b02657f494dff295f6cdd1f29df30fa61e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Jul 2005 15:00:32 -0700 Subject: [IPV4]: Bug fix in rt_check_expire() - rt_check_expire() fixes (an overflow occured if size of the hash was >= 65536) reminder of the bugfix: The rt_check_expire() has a serious problem on machines with large route caches, and a standard HZ value of 1000. With default values, ie ip_rt_gc_interval = 60*HZ = 60000 ; the loop count : for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; overflows (t is a 31 bit value) as soon rt_hash_log is >= 16 (65536 slots in route cache hash table). In this case, rt_check_expire() does nothing at all Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 9fcbb1b0a8d6..726ea5e8180a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -54,7 +54,7 @@ * Marc Boucher : routing by fwmark * Robert Olsson : Added rt_cache statistics * Arnaldo C. Melo : Convert proc stuff to seq_file - * Eric Dumazet : hashed spinlocks + * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -606,18 +606,25 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head, /* This runs via a timer and thus is always in BH context. */ static void rt_check_expire(unsigned long dummy) { - static int rover; - int i = rover, t; + static unsigned int rover; + unsigned int i = rover, goal; struct rtable *rth, **rthp; unsigned long now = jiffies; - - for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; - t -= ip_rt_gc_timeout) { + u64 mult; + + mult = ((u64)ip_rt_gc_interval) << rt_hash_log; + if (ip_rt_gc_timeout > 1) + do_div(mult, ip_rt_gc_timeout); + goal = (unsigned int)mult; + if (goal > rt_hash_mask) goal = rt_hash_mask + 1; + for (; goal > 0; goal--) { unsigned long tmo = ip_rt_gc_timeout; i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; + if (*rthp == 0) + continue; spin_lock(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { if (rth->u.dst.expires) { @@ -658,7 +665,7 @@ static void rt_check_expire(unsigned long dummy) break; } rover = i; - mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); + mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval); } /* This can run from both BH and non-BH contexts, the latter -- cgit From db1322b8012e1a8ad711c04813817328cff46718 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 5 Jul 2005 15:01:25 -0700 Subject: [DECNET]: Fix memset overflow on 64bit archs while dumping decnet routing rules Signed-off-by: Thomas Graf Signed-off-by: David S. 
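The arithmetic behind the one-line fix below, with illustrative sizes (the declared length of cb->args may differ; five longs is a stand-in): on a 64-bit arch the args slots are 8-byte longs, so sizeof(cb->args) is 40. The old code cleared 40 - sizeof(int) = 36 bytes starting at &cb->args[1], overrunning the array by 4 bytes; subtracting sizeof(cb->args[0]) clears 40 - 8 = 32 bytes, exactly args[1] through args[4]. On 32-bit, long and int coincide, which is why only 64-bit archs were bitten. A compact demonstration:

	long args[5];				/* stand-in for cb->args */
	/* buggy on LP64: writes 4 bytes past the end of args[] */
	/* memset(&args[1], 0, sizeof(args) - sizeof(int)); */
	/* fixed: clears exactly args[1]..args[4] */
	memset(&args[1], 0, sizeof(args) - sizeof(args[0]));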
Miller --- net/decnet/dn_fib.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index 9934b25720e4..99bc061759c3 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -551,7 +551,8 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb) if (t < s_t) continue; if (t > s_t) - memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int)); + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); tb = dn_fib_get_table(t, 0); if (tb == NULL) continue; -- cgit From 2f36895aa774cf4d1c3d68921e0209e796b66600 Mon Sep 17 00:00:00 2001 From: Robert Olsson Date: Tue, 5 Jul 2005 15:02:40 -0700 Subject: [IPV4]: More broken memory allocation fixes for fib_trie Below a patch to preallocate memory when doing resize of trie (inflate halve) If preallocations fails it just skips the resize of this tnode for this time. The oops we got when killing bgpd (with full routing) is now gone. Patrick memory patch is also used. Signed-off-by: Robert Olsson Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 177 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 145 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 9038b914b4b1..4be234c7d8c3 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -43,7 +43,7 @@ * 2 of the License, or (at your option) any later version. */ -#define VERSION "0.324" +#define VERSION "0.325" #include #include @@ -136,6 +136,7 @@ struct trie_use_stats { unsigned int semantic_match_passed; unsigned int semantic_match_miss; unsigned int null_node_hit; + unsigned int resize_node_skipped; }; #endif @@ -164,8 +165,8 @@ static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); static int tnode_child_length(struct tnode *tn); static struct node *resize(struct trie *t, struct tnode *tn); -static struct tnode *inflate(struct trie *t, struct tnode *tn); -static struct tnode *halve(struct trie *t, struct tnode *tn); +static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err); +static struct tnode *halve(struct trie *t, struct tnode *tn, int *err); static void tnode_free(struct tnode *tn); static void trie_dump_seq(struct seq_file *seq, struct trie *t); extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); @@ -481,6 +482,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w static struct node *resize(struct trie *t, struct tnode *tn) { int i; + int err = 0; if (!tn) return NULL; @@ -577,12 +579,20 @@ static struct node *resize(struct trie *t, struct tnode *tn) */ check_tnode(tn); - + + err = 0; while ((tn->full_children > 0 && 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= inflate_threshold * tnode_child_length(tn))) { - tn = inflate(t, tn); + tn = inflate(t, tn, &err); + + if(err) { +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.resize_node_skipped++; +#endif + break; + } } check_tnode(tn); @@ -591,11 +601,22 @@ static struct node *resize(struct trie *t, struct tnode *tn) * Halve as long as the number of empty children in this * node is above threshold. 
*/ + + err = 0; while (tn->bits > 1 && 100 * (tnode_child_length(tn) - tn->empty_children) < - halve_threshold * tnode_child_length(tn)) + halve_threshold * tnode_child_length(tn)) { + + tn = halve(t, tn, &err); + + if(err) { +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.resize_node_skipped++; +#endif + break; + } + } - tn = halve(t, tn); /* Only one child remains */ @@ -620,7 +641,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) return (struct node *) tn; } -static struct tnode *inflate(struct trie *t, struct tnode *tn) +static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) { struct tnode *inode; struct tnode *oldtnode = tn; @@ -632,8 +653,63 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); - if (!tn) - trie_bug("tnode_new failed"); + if (!tn) { + *err = -ENOMEM; + return oldtnode; + } + + /* + * Preallocate and store tnodes before the actual work so we + * don't get into an inconsistent state if memory allocation + * fails. In case of failure we return the oldnode and inflate + * of tnode is ignored. + */ + + for(i = 0; i < olen; i++) { + struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); + + if (inode && + IS_TNODE(inode) && + inode->pos == oldtnode->pos + oldtnode->bits && + inode->bits > 1) { + struct tnode *left, *right; + + t_key m = TKEY_GET_MASK(inode->pos, 1); + + left = tnode_new(inode->key&(~m), inode->pos + 1, + inode->bits - 1); + + if(!left) { + *err = -ENOMEM; + break; + } + + right = tnode_new(inode->key|m, inode->pos + 1, + inode->bits - 1); + + if(!right) { + *err = -ENOMEM; + break; + } + + put_child(t, tn, 2*i, (struct node *) left); + put_child(t, tn, 2*i+1, (struct node *) right); + } + } + + if(*err) { + int size = tnode_child_length(tn); + int j; + + for(j = 0; j < size; j++) + if( tn->child[j]) + tnode_free((struct tnode *)tn->child[j]); + + tnode_free(tn); + + *err = -ENOMEM; + return oldtnode; + } for(i = 0; i < olen; i++) { struct node *node = tnode_get_child(oldtnode, i); @@ -646,7 +722,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) if(IS_LEAF(node) || ((struct tnode *) node)->pos > tn->pos + tn->bits - 1) { - if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1, + if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, 1) == 0) put_child(t, tn, 2*i, node); else @@ -686,27 +762,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) * the position (inode->pos) */ - t_key m = TKEY_GET_MASK(inode->pos, 1); - /* Use the old key, but set the new significant * bit to zero. */ - left = tnode_new(inode->key&(~m), inode->pos + 1, - inode->bits - 1); - if(!left) - trie_bug("tnode_new failed"); - - - /* Use the old key, but set the new significant - * bit to one. 
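/*
 * The restructuring around this hunk, reduced to a self-contained skeleton
 * (types and names are mine, not the patch's): take every allocation a
 * rebuild needs up front, and only then mutate the shared structure, so an
 * -ENOMEM can never leave the trie half-rebuilt.
 */
#include <stdlib.h>

struct node { struct node *left, *right; };

static int split(struct node *parent)
{
	/* Phase 1: allocate both halves before touching the tree. */
	struct node *l = calloc(1, sizeof(*l));
	struct node *r = calloc(1, sizeof(*r));

	if (l == NULL || r == NULL) {
		free(l);		/* free(NULL) is a no-op */
		free(r);
		return -1;		/* caller keeps the old, intact node */
	}
	/* Phase 2: publish; nothing below this line can fail. */
	parent->left = l;
	parent->right = r;
	return 0;
}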
- */ - right = tnode_new(inode->key|m, inode->pos + 1, - inode->bits - 1); + left = (struct tnode *) tnode_get_child(tn, 2*i); + put_child(t, tn, 2*i, NULL); + + if(!left) + BUG(); + + right = (struct tnode *) tnode_get_child(tn, 2*i+1); + put_child(t, tn, 2*i+1, NULL); + + if(!right) + BUG(); - if(!right) - trie_bug("tnode_new failed"); - size = tnode_child_length(left); for(j = 0; j < size; j++) { put_child(t, left, j, inode->child[j]); @@ -722,7 +793,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) return tn; } -static struct tnode *halve(struct trie *t, struct tnode *tn) +static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) { struct tnode *oldtnode = tn; struct node *left, *right; @@ -733,8 +804,48 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); - if(!tn) - trie_bug("tnode_new failed"); + if (!tn) { + *err = -ENOMEM; + return oldtnode; + } + + /* + * Preallocate and store tnodes before the actual work so we + * don't get into an inconsistent state if memory allocation + * fails. In case of failure we return the oldnode and halve + * of tnode is ignored. + */ + + for(i = 0; i < olen; i += 2) { + left = tnode_get_child(oldtnode, i); + right = tnode_get_child(oldtnode, i+1); + + /* Two nonempty children */ + if( left && right) { + struct tnode *newBinNode = + tnode_new(left->key, tn->pos + tn->bits, 1); + + if(!newBinNode) { + *err = -ENOMEM; + break; + } + put_child(t, tn, i/2, (struct node *)newBinNode); + } + } + + if(*err) { + int size = tnode_child_length(tn); + int j; + + for(j = 0; j < size; j++) + if( tn->child[j]) + tnode_free((struct tnode *)tn->child[j]); + + tnode_free(tn); + + *err = -ENOMEM; + return oldtnode; + } for(i = 0; i < olen; i += 2) { left = tnode_get_child(oldtnode, i); @@ -751,10 +862,11 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) /* Two nonempty children */ else { struct tnode *newBinNode = - tnode_new(left->key, tn->pos + tn->bits, 1); + (struct tnode *) tnode_get_child(tn, i/2); + put_child(t, tn, i/2, NULL); if(!newBinNode) - trie_bug("tnode_new failed"); + BUG(); put_child(t, newBinNode, 0, left); put_child(t, newBinNode, 1, right); @@ -2322,6 +2434,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq) seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); + seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped); #ifdef CLEAR_STATS memset(&(t->stats), 0, sizeof(t->stats)); #endif -- cgit From 52609c0b56d7c8dfb6e16ec0a715adf8fcbdae36 Mon Sep 17 00:00:00 2001 From: David Chau Date: Tue, 5 Jul 2005 15:11:06 -0700 Subject: [NET]: improve readability of dev_set_promiscuity() in net/core/dev.c A trivial patch to improve the readability of dev_set_promiscuity() in net/core/dev.c. New code does exactly the same thing as original code. Signed-off-by: David Chau Signed-off-by: Domen Puncer Signed-off-by: David S. 
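A usage sketch of the counter semantics the rewrite below preserves: dev->promiscuity is a reference count, and IFF_PROMISC should simply mirror whether it is nonzero (the call sequence and comments are illustrative):

	dev_set_promiscuity(dev, 1);	/* user A: count 0 -> 1, flag set     */
	dev_set_promiscuity(dev, 1);	/* user B: count 1 -> 2, flag stays   */
	dev_set_promiscuity(dev, -1);	/* A done: count 2 -> 1, still on     */
	dev_set_promiscuity(dev, -1);	/* B done: count 1 -> 0, flag cleared */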
Miller --- net/core/dev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 7016e0c36b3d..7f5f62c65115 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2089,10 +2089,11 @@ void dev_set_promiscuity(struct net_device *dev, int inc) { unsigned short old_flags = dev->flags; - dev->flags |= IFF_PROMISC; if ((dev->promiscuity += inc) == 0) dev->flags &= ~IFF_PROMISC; - if (dev->flags ^ old_flags) { + else + dev->flags |= IFF_PROMISC; + if (dev->flags != old_flags) { dev_mc_upload(dev); printk(KERN_INFO "device %s %s promiscuous mode\n", dev->name, (dev->flags & IFF_PROMISC) ? "entered" : -- cgit From c65f7f00c587828e3d50737805a78f74804972de Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:17:25 -0700 Subject: [TCP]: Simplify SKB data portion allocation with NETIF_F_SG. The ideal and most optimal layout for an SKB when doing scatter-gather is to put all the headers at skb->data, and all the user data in the page array. This makes SKB splitting and combining extremely simple, especially before a packet goes onto the wire the first time. So, when sk_stream_alloc_pskb() is given a zero size, make sure there is no skb_tailroom(). This is achieved by applying SKB_DATA_ALIGN() to the header length used here. Next, make select_size() in TCP output segmentation use a length of zero when NETIF_F_SG is true on the outgoing interface. Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 882436da9a3a..be354155b2f9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -756,13 +756,9 @@ static inline int select_size(struct sock *sk, struct tcp_sock *tp) { int tmp = tp->mss_cache_std; - if (sk->sk_route_caps & NETIF_F_SG) { - int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); + if (sk->sk_route_caps & NETIF_F_SG) + tmp = 0; - if (tmp >= pgbreak && - tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) - tmp = pgbreak; - } return tmp; } @@ -872,11 +868,6 @@ new_segment: tcp_mark_push(tp, skb); goto new_segment; } else if (page) { - /* If page is cached, align - * offset to L1 cache boundary - */ - off = (off + L1_CACHE_BYTES - 1) & - ~(L1_CACHE_BYTES - 1); if (off == PAGE_SIZE) { put_page(page); TCP_PAGE(sk) = page = NULL; -- cgit From fc6415bcb0f58f03adb910e56d7e1df6368794e0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:17:45 -0700 Subject: [TCP]: Fix quick-ack decrementing with TSO. On each packet output, we call tcp_dec_quickack_mode() if the ACK flag is set. It drops tp->ack.quick until it hits zero, at which time we deflate the ATO value. When doing TSO, we are emitting multiple packets with ACK set, so we should decrement tp->ack.quick that many segments. Note that, unlike this case, tcp_enter_cwr() should not take the tcp_skb_pcount(skb) into consideration. That function, one time, readjusts tp->snd_cwnd and moves into TCP_CA_CWR state. Signed-off-by: David S. 
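A sketch of what "decrement by the segment count" means on the bookkeeping side. The tcp.h half of this change is not shown in the diff below, so treat this reconstruction as an assumption rather than the committed code:

	static inline void tcp_dec_quickack_mode(struct tcp_sock *tp,
						 const unsigned int pkts)
	{
		if (tp->ack.quick) {
			if (pkts >= tp->ack.quick) {
				tp->ack.quick = 0;
				tp->ack.ato = TCP_ATO_MIN;	/* leave quickack mode */
			} else
				tp->ack.quick -= pkts;
		}
	}

So a 4-segment TSO frame carrying an ACK spends four units of the quick-ack budget at once instead of one.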
Miller --- net/ipv4/tcp_output.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0e17c244875c..389deeb2a457 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -140,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp, tp->ack.pingpong = 1; } -static __inline__ void tcp_event_ack_sent(struct sock *sk) +static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { struct tcp_sock *tp = tcp_sk(sk); - tcp_dec_quickack_mode(tp); + tcp_dec_quickack_mode(tp, pkts); tcp_clear_xmit_timer(sk, TCP_TIME_DACK); } @@ -355,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) tp->af_specific->send_check(sk, th, skb->len, skb); if (tcb->flags & TCPCB_FLAG_ACK) - tcp_event_ack_sent(sk); + tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); if (skb->len != tcp_header_size) tcp_event_data_sent(tp, skb, sk); -- cgit From f6302d1d78f77c2d4c8bd32b0afc2df7fdf5f281 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:18:03 -0700 Subject: [TCP]: Move send test logic out of net/tcp.h This just moves the code into tcp_output.c, no code logic changes are made by this patch. Using this as a baseline, we can begin to untangle the mess of comparisons for the Nagle test et al. We will also be able to reduce all of the redundant computation that occurs when outputting data packets. Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 150 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 129 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 389deeb2a457..2cbe879ee16a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -413,6 +413,135 @@ static inline void tcp_tso_set_push(struct sk_buff *skb) TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; } +static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (skb->len <= tp->mss_cache_std || + !(sk->sk_route_caps & NETIF_F_TSO)) { + /* Avoid the costly divide in the normal + * non-TSO case. + */ + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; + } else { + unsigned int factor; + + factor = skb->len + (tp->mss_cache_std - 1); + factor /= tp->mss_cache_std; + skb_shinfo(skb)->tso_segs = factor; + skb_shinfo(skb)->tso_size = tp->mss_cache_std; + } +} + +static inline int tcp_minshall_check(const struct tcp_sock *tp) +{ + return after(tp->snd_sml,tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +/* Return 0, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. + * 2. Or it contains FIN. + * 3. Or TCP_NODELAY was set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. + */ + +static inline int tcp_nagle_check(const struct tcp_sock *tp, + const struct sk_buff *skb, + unsigned mss_now, int nonagle) +{ + return (skb->len < mss_now && + !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && + ((nonagle&TCP_NAGLE_CORK) || + (!nonagle && + tp->packets_out && + tcp_minshall_check(tp)))); +} + +/* This checks if the data bearing packet SKB (usually sk->sk_send_head) + * should be put on the wire right now. 
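/* A worked example of the test being introduced here, with illustrative
 * numbers: with tp->snd_cwnd = 10, seven packets already in flight and a
 * 3-segment TSO frame at the head of the queue, the congestion check
 * (tcp_packets_in_flight(tp) + (pkts - 1)) < tp->snd_cwnd evaluates as
 * 7 + 2 = 9 < 10, so the frame may go out; a 4-segment frame would have to
 * wait for an ACK to open the window. */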
+ */ +static int tcp_snd_test(struct sock *sk, struct sk_buff *skb, + unsigned cur_mss, int nonagle) +{ + struct tcp_sock *tp = tcp_sk(sk); + int pkts = tcp_skb_pcount(skb); + + if (!pkts) { + tcp_set_skb_tso_segs(sk, skb); + pkts = tcp_skb_pcount(skb); + } + + /* RFC 1122 - section 4.2.3.4 + * + * We must queue if + * + * a) The right edge of this frame exceeds the window + * b) There are packets in flight and we have a small segment + * [SWS avoidance and Nagle algorithm] + * (part of SWS is done on packetization) + * Minshall version sounds: there are no _small_ + * segments in flight. (tcp_nagle_check) + * c) We have too many packets 'in flight' + * + * Don't use the nagle rule for urgent data (or + * for the final FIN -DaveM). + * + * Also, Nagle rule does not apply to frames, which + * sit in the middle of queue (they have no chances + * to get new data) and if room at tail of skb is + * not enough to save something seriously (<32 for now). + */ + + /* Don't be strict about the congestion window for the + * final FIN frame. -DaveM + */ + return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode + || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) && + (((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) || + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) && + !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd)); +} + +static inline int tcp_skb_is_last(const struct sock *sk, + const struct sk_buff *skb) +{ + return skb->next == (struct sk_buff *)&sk->sk_write_queue; +} + +/* Push out any pending frames which were held back due to + * TCP_CORK or attempt at coalescing tiny packets. + * The socket must be locked by the caller. + */ +void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, + unsigned cur_mss, int nonagle) +{ + struct sk_buff *skb = sk->sk_send_head; + + if (skb) { + if (!tcp_skb_is_last(sk, skb)) + nonagle = TCP_NAGLE_PUSH; + if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || + tcp_write_xmit(sk, nonagle)) + tcp_check_probe_timer(sk, tp); + } + tcp_cwnd_validate(sk, tp); +} + +int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) +{ + struct sk_buff *skb = sk->sk_send_head; + + return (skb && + tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), + (tcp_skb_is_last(sk, skb) ? + TCP_NAGLE_PUSH : + tp->nonagle))); +} + + /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. */ @@ -434,27 +563,6 @@ void tcp_push_one(struct sock *sk, unsigned cur_mss) } } -void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (skb->len <= tp->mss_cache_std || - !(sk->sk_route_caps & NETIF_F_TSO)) { - /* Avoid the costly divide in the normal - * non-TSO case. - */ - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; - } else { - unsigned int factor; - - factor = skb->len + (tp->mss_cache_std - 1); - factor /= tp->mss_cache_std; - skb_shinfo(skb)->tso_segs = factor; - skb_shinfo(skb)->tso_size = tp->mss_cache_std; - } -} - /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. -- cgit From 84d3e7b9573291a1ea845bdd51b74bb484597661 Mon Sep 17 00:00:00 2001 From: "David S. 
Miller" Date: Tue, 5 Jul 2005 15:18:18 -0700 Subject: [TCP]: Move __tcp_data_snd_check into tcp_output.c It reimplements portions of tcp_snd_check(), so it we move it to tcp_output.c we can consolidate it's logic much easier in a later change. Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 10 ---------- net/ipv4/tcp_output.c | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7bbbbc33eb4b..577424323d59 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3346,16 +3346,6 @@ static inline void tcp_check_space(struct sock *sk) } } -static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || - tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tp->nonagle)) - tcp_check_probe_timer(sk, tp); -} - static __inline__ void tcp_data_snd_check(struct sock *sk) { struct sk_buff *skb = sk->sk_send_head; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2cbe879ee16a..362b811a2460 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -530,6 +530,16 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, tcp_cwnd_validate(sk, tp); } +void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || + tcp_packets_in_flight(tp) >= tp->snd_cwnd || + tcp_write_xmit(sk, tp->nonagle)) + tcp_check_probe_timer(sk, tp); +} + int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) { struct sk_buff *skb = sk->sk_send_head; -- cgit From f44b527177d57ed382bfd93e1b55232465f6d058 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:18:34 -0700 Subject: [TCP]: Add missing skb_header_release() call to tcp_fragment(). When we add any new packet to the TCP socket write queue, we must call skb_header_release() on it in order for the TSO sharing checks in the drivers to work. Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 362b811a2460..5e63ed09658d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -655,6 +655,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) } /* Link BUFF into the send queue. */ + skb_header_release(buff); __skb_append(skb, buff); return 0; -- cgit From a762a9800752f05fa8768bb0ac35d0e7f1bcfe7f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:18:51 -0700 Subject: [TCP]: Kill extra cwnd validate in __tcp_push_pending_frames(). The tcp_cwnd_validate() function should only be invoked if we actually send some frames, yet __tcp_push_pending_frames() will always invoke it. tcp_write_xmit() does the call for us, so the call here can simply be removed. Also, tcp_write_xmit() can be marked static. Signed-off-by: David S. 
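For the RFC 2861 logic consolidated in the diff below, a worked illustration (my numbers, and the halving formula is approximate): suppose snd_cwnd has grown to 40 but the application keeps only about 10 packets in flight. snd_cwnd_used then ratchets up to 10 while the window never fills, and once an RTO passes in that state tcp_cwnd_application_limited() decays the window roughly toward (snd_cwnd + snd_cwnd_used) / 2 = 25, so a later burst cannot ride a stale, unvalidated cwnd. Running the validation only when frames were actually sent, as this patch arranges, is what keeps snd_cwnd_used and snd_cwnd_stamp meaningful.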
Miller --- net/ipv4/tcp_output.c | 79 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5e63ed09658d..a6375ca2a59e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -511,35 +511,6 @@ static inline int tcp_skb_is_last(const struct sock *sk, return skb->next == (struct sk_buff *)&sk->sk_write_queue; } -/* Push out any pending frames which were held back due to - * TCP_CORK or attempt at coalescing tiny packets. - * The socket must be locked by the caller. - */ -void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, - unsigned cur_mss, int nonagle) -{ - struct sk_buff *skb = sk->sk_send_head; - - if (skb) { - if (!tcp_skb_is_last(sk, skb)) - nonagle = TCP_NAGLE_PUSH; - if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || - tcp_write_xmit(sk, nonagle)) - tcp_check_probe_timer(sk, tp); - } - tcp_cwnd_validate(sk, tp); -} - -void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || - tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tp->nonagle)) - tcp_check_probe_timer(sk, tp); -} - int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) { struct sk_buff *skb = sk->sk_send_head; @@ -841,6 +812,26 @@ unsigned int tcp_current_mss(struct sock *sk, int large) return mss_now; } +/* Congestion window validation. (RFC2861) */ + +static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) +{ + __u32 packets_out = tp->packets_out; + + if (packets_out >= tp->snd_cwnd) { + /* Network is feed fully. */ + tp->snd_cwnd_used = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + } else { + /* Network starves. */ + if (tp->packets_out > tp->snd_cwnd_used) + tp->snd_cwnd_used = tp->packets_out; + + if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) + tcp_cwnd_application_limited(sk); + } +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -848,7 +839,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large) * Returns 1, if no segments are in flight and we have queued segments, but * cannot send anything now because of SWS or another problem. */ -int tcp_write_xmit(struct sock *sk, int nonagle) +static int tcp_write_xmit(struct sock *sk, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); unsigned int mss_now; @@ -901,6 +892,34 @@ int tcp_write_xmit(struct sock *sk, int nonagle) return 0; } +/* Push out any pending frames which were held back due to + * TCP_CORK or attempt at coalescing tiny packets. + * The socket must be locked by the caller. 
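/* For orientation (an editorial note, not patch text): callers normally
 * reach this function through the tcp_push_pending_frames() wrapper, which
 * supplies tcp_current_mss(sk, 1) and tp->nonagle; that pairing is visible
 * in the tcp_data_snd_check() change a few patches below. */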
+ */ +void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, + unsigned cur_mss, int nonagle) +{ + struct sk_buff *skb = sk->sk_send_head; + + if (skb) { + if (!tcp_skb_is_last(sk, skb)) + nonagle = TCP_NAGLE_PUSH; + if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || + tcp_write_xmit(sk, nonagle)) + tcp_check_probe_timer(sk, tp); + } +} + +void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || + tcp_packets_in_flight(tp) >= tp->snd_cwnd || + tcp_write_xmit(sk, tp->nonagle)) + tcp_check_probe_timer(sk, tp); +} + /* This function returns the amount that we can raise the * usable window based on the following constraints * -- cgit From 92df7b518dcb113de8bc2494e3cd275ad887f12b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:19:06 -0700 Subject: [TCP]: tcp_write_xmit() tabbing cleanup Put the main basic block of work at the top-level of tabbing, and mark the TCP_CLOSE test with unlikely(). Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 68 +++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a6375ca2a59e..2a8409c3af1a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -842,54 +842,54 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) static int tcp_write_xmit(struct sock *sk, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; unsigned int mss_now; + int sent_pkts; /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and all * will be happy. */ - if (sk->sk_state != TCP_CLOSE) { - struct sk_buff *skb; - int sent_pkts = 0; + if (unlikely(sk->sk_state == TCP_CLOSE)) + return 0; - /* Account for SACKS, we may need to fragment due to this. - * It is just like the real MSS changing on us midstream. - * We also handle things correctly when the user adds some - * IP options mid-stream. Silly to do, but cover it. - */ - mss_now = tcp_current_mss(sk, 1); - - while ((skb = sk->sk_send_head) && - tcp_snd_test(sk, skb, mss_now, - tcp_skb_is_last(sk, skb) ? nonagle : - TCP_NAGLE_PUSH)) { - if (skb->len > mss_now) { - if (tcp_fragment(sk, skb, mss_now)) - break; - } - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); - if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + /* Account for SACKS, we may need to fragment due to this. + * It is just like the real MSS changing on us midstream. + * We also handle things correctly when the user adds some + * IP options mid-stream. Silly to do, but cover it. + */ + mss_now = tcp_current_mss(sk, 1); + sent_pkts = 0; + while ((skb = sk->sk_send_head) && + tcp_snd_test(sk, skb, mss_now, + tcp_skb_is_last(sk, skb) ? nonagle : + TCP_NAGLE_PUSH)) { + if (skb->len > mss_now) { + if (tcp_fragment(sk, skb, mss_now)) break; + } - /* Advance the send_head. This one is sent out. - * This call will increment packets_out. - */ - update_send_head(sk, tp, skb); + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_tso_set_push(skb); + if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + break; - tcp_minshall_update(tp, mss_now, skb); - sent_pkts = 1; - } + /* Advance the send_head. This one is sent out. + * This call will increment packets_out. 
+ */ + update_send_head(sk, tp, skb); - if (sent_pkts) { - tcp_cwnd_validate(sk, tp); - return 0; - } + tcp_minshall_update(tp, mss_now, skb); + sent_pkts = 1; + } - return !tp->packets_out && sk->sk_send_head; + if (sent_pkts) { + tcp_cwnd_validate(sk, tp); + return 0; } - return 0; + + return !tp->packets_out && sk->sk_send_head; } /* Push out any pending frames which were held back due to -- cgit From a2e2a59c93cc8ba39caa9011c2573f429e40ccd9 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:19:23 -0700 Subject: [TCP]: Fix redundant calculations of tcp_current_mss() tcp_write_xmit() uses tcp_current_mss(), but some of it's callers, namely __tcp_push_pending_frames(), already has this value available already. While we're here, fix the "cur_mss" argument to be "unsigned int" instead of plain "unsigned". Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2a8409c3af1a..e292e11c7319 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -839,11 +839,10 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) * Returns 1, if no segments are in flight and we have queued segments, but * cannot send anything now because of SWS or another problem. */ -static int tcp_write_xmit(struct sock *sk, int nonagle) +static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int mss_now; int sent_pkts; /* If we are closed, the bytes will have to remain here. @@ -853,13 +852,6 @@ static int tcp_write_xmit(struct sock *sk, int nonagle) if (unlikely(sk->sk_state == TCP_CLOSE)) return 0; - - /* Account for SACKS, we may need to fragment due to this. - * It is just like the real MSS changing on us midstream. - * We also handle things correctly when the user adds some - * IP options mid-stream. Silly to do, but cover it. - */ - mss_now = tcp_current_mss(sk, 1); sent_pkts = 0; while ((skb = sk->sk_send_head) && tcp_snd_test(sk, skb, mss_now, @@ -897,7 +889,7 @@ static int tcp_write_xmit(struct sock *sk, int nonagle) * The socket must be locked by the caller. */ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, - unsigned cur_mss, int nonagle) + unsigned int cur_mss, int nonagle) { struct sk_buff *skb = sk->sk_send_head; @@ -905,7 +897,7 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, if (!tcp_skb_is_last(sk, skb)) nonagle = TCP_NAGLE_PUSH; if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || - tcp_write_xmit(sk, nonagle)) + tcp_write_xmit(sk, cur_mss, nonagle)) tcp_check_probe_timer(sk, tp); } } @@ -916,7 +908,7 @@ void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tp->nonagle)) + tcp_write_xmit(sk, tcp_current_mss(sk, 1), tp->nonagle)) tcp_check_probe_timer(sk, tp); } -- cgit From 55c97f3e990c1ff63957c64f6cb10711a09fd70e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:19:38 -0700 Subject: [TCP]: Fix __tcp_push_pending_frames() 'nonagle' handling. 'nonagle' should be passed to the tcp_snd_test() function as 'TCP_NAGLE_PUSH' if we are checking an SKB not at the tail of the write_queue. This is because Nagle does not apply to such frames since we cannot possibly tack more data onto them. 
However, while doing this __tcp_push_pending_frames() makes all of the packets in the write_queue use this modified 'nonagle' value. Fix the bug and simplify this function by just calling tcp_write_xmit() directly if sk_send_head is non-NULL. As a result, we can now make tcp_data_snd_check() just call tcp_push_pending_frames() instead of the specialized __tcp_data_snd_check(). Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 17 +++++++---------- net/ipv4/tcp_output.c | 15 +-------------- 2 files changed, 8 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 577424323d59..b27be2f819ac 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3346,12 +3346,9 @@ static inline void tcp_check_space(struct sock *sk) } } -static __inline__ void tcp_data_snd_check(struct sock *sk) +static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) { - struct sk_buff *skb = sk->sk_send_head; - - if (skb != NULL) - __tcp_data_snd_check(sk, skb); + tcp_push_pending_frames(sk, tp); tcp_check_space(sk); } @@ -3645,7 +3642,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ tcp_ack(sk, skb, 0); __kfree_skb(skb); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); return 0; } else { /* Header too small */ TCP_INC_STATS_BH(TCP_MIB_INERRS); @@ -3711,7 +3708,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { /* Well, only one small jumplet in fast path... */ tcp_ack(sk, skb, FLAG_DATA); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); if (!tcp_ack_scheduled(tp)) goto no_ack; } @@ -3789,7 +3786,7 @@ step5: /* step 7: process the segment text */ tcp_data_queue(sk, skb); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); tcp_ack_snd_check(sk); return 0; @@ -4099,7 +4096,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Do step6 onward by hand. */ tcp_urg(sk, skb, th); __kfree_skb(skb); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); return 0; } @@ -4290,7 +4287,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* tcp_data could move socket to TIME-WAIT */ if (sk->sk_state != TCP_CLOSE) { - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); tcp_ack_snd_check(sk); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e292e11c7319..ce1d7cfbecfc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -894,24 +894,11 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb = sk->sk_send_head; if (skb) { - if (!tcp_skb_is_last(sk, skb)) - nonagle = TCP_NAGLE_PUSH; - if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || - tcp_write_xmit(sk, cur_mss, nonagle)) + if (tcp_write_xmit(sk, cur_mss, nonagle)) tcp_check_probe_timer(sk, tp); } } -void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || - tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tcp_current_mss(sk, 1), tp->nonagle)) - tcp_check_probe_timer(sk, tp); -} - /* This function returns the amount that we can raise the * usable window based on the following constraints * -- cgit From 7f4dd0a9438c73cbb1c240ece31390cf2c57294e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:19:54 -0700 Subject: [TCP]: Break out tcp_snd_test() into it's constituent parts. 
tcp_snd_test() does several different things, use inline functions to express this more clearly. 1) It initializes the TSO count of SKB, if necessary. 2) It performs the Nagle test. 3) It makes sure the congestion window is adhered to. 4) It makes sure SKB fits into the send window. This cleanup also sets things up so that things like the available packets in the congestion window does not need to be calculated multiple times by packet sending loops such as tcp_write_xmit(). Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 120 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 82 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ce1d7cfbecfc..8327e5e86d15 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -434,6 +434,33 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) } } +/* Does SKB fit into the send window? */ +static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss) +{ + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + return !after(end_seq, tp->snd_una + tp->snd_wnd); +} + +/* Can at least one segment of SKB be sent right now, according to the + * congestion window rules? If so, return how many segments are allowed. + */ +static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 in_flight, cwnd; + + /* Don't be strict about the congestion window for the final FIN. */ + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + return 1; + + in_flight = tcp_packets_in_flight(tp); + cwnd = tp->snd_cwnd; + if (in_flight < cwnd) + return (cwnd - in_flight); + + return 0; +} + static inline int tcp_minshall_check(const struct tcp_sock *tp) { return after(tp->snd_sml,tp->snd_una) && @@ -442,7 +469,7 @@ static inline int tcp_minshall_check(const struct tcp_sock *tp) /* Return 0, if packet can be sent now without violation Nagle's rules: * 1. It is full sized. - * 2. Or it contains FIN. + * 2. Or it contains FIN. (already checked by caller) * 3. Or TCP_NODELAY was set. * 4. Or TCP_CORK is not set, and all sent packets are ACKed. * With Minshall's modification: all sent small packets are ACKed. @@ -453,56 +480,73 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp, unsigned mss_now, int nonagle) { return (skb->len < mss_now && - !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && ((nonagle&TCP_NAGLE_CORK) || (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); } -/* This checks if the data bearing packet SKB (usually sk->sk_send_head) - * should be put on the wire right now. +/* Return non-zero if the Nagle test allows this packet to be + * sent now. */ -static int tcp_snd_test(struct sock *sk, struct sk_buff *skb, - unsigned cur_mss, int nonagle) +static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) { - struct tcp_sock *tp = tcp_sk(sk); - int pkts = tcp_skb_pcount(skb); + /* Nagle rule does not apply to frames, which sit in the middle of the + * write_queue (they have no chances to get new data). + * + * This is implemented in the callers, where they modify the 'nonagle' + * argument based upon the location of SKB in the send queue. + */ + if (nonagle & TCP_NAGLE_PUSH) + return 1; + + /* Don't use the nagle rule for urgent data (or for the final FIN). 
*/ + if (tp->urg_mode || + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) + return 1; + + if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) + return 1; - if (!pkts) { + return 0; +} + +/* This must be invoked the first time we consider transmitting + * SKB onto the wire. + */ +static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb) +{ + int tso_segs = tcp_skb_pcount(skb); + + if (!tso_segs) { tcp_set_skb_tso_segs(sk, skb); - pkts = tcp_skb_pcount(skb); + tso_segs = tcp_skb_pcount(skb); } + return tso_segs; +} - /* RFC 1122 - section 4.2.3.4 - * - * We must queue if - * - * a) The right edge of this frame exceeds the window - * b) There are packets in flight and we have a small segment - * [SWS avoidance and Nagle algorithm] - * (part of SWS is done on packetization) - * Minshall version sounds: there are no _small_ - * segments in flight. (tcp_nagle_check) - * c) We have too many packets 'in flight' - * - * Don't use the nagle rule for urgent data (or - * for the final FIN -DaveM). - * - * Also, Nagle rule does not apply to frames, which - * sit in the middle of queue (they have no chances - * to get new data) and if room at tail of skb is - * not enough to save something seriously (<32 for now). - */ +/* This checks if the data bearing packet SKB (usually sk->sk_send_head) + * should be put on the wire right now. If so, it returns the number of + * packets allowed by the congestion window. + */ +static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int cwnd_quota; - /* Don't be strict about the congestion window for the - * final FIN frame. -DaveM - */ - return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode - || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) && - (((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) || - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) && - !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd)); + tcp_init_tso_segs(sk, skb); + + if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) + return 0; + + cwnd_quota = tcp_cwnd_test(tp, skb); + if (cwnd_quota && + !tcp_snd_wnd_test(tp, skb, cur_mss)) + cwnd_quota = 0; + + return cwnd_quota; } static inline int tcp_skb_is_last(const struct sock *sk, -- cgit From aa93466bdfd901b926e033801f0b82b3eaa67be2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:20:09 -0700 Subject: [TCP]: Eliminate redundant computations in tcp_write_xmit(). tcp_snd_test() is run for every packet output by a single call to tcp_write_xmit(), but this is not necessary. For one, the congestion window space needs to only be calculated one time, then used throughout the duration of the loop. This cleanup also makes experimenting with different TSO packetization schemes much easier. Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8327e5e86d15..0a4cd24b6578 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -887,6 +887,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; + unsigned int tso_segs, cwnd_quota; int sent_pkts; /* If we are closed, the bytes will have to remain here. 
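/*
 * A condensed view (mine, not the patch's) of how a sender consumes the four
 * gates factored out above, once tcp_write_xmit() is reworked in the next
 * hunk:
 *
 *	tcp_init_tso_segs(sk, skb);               make pcount valid
 *	tcp_nagle_test(tp, skb, mss, nonagle)     0 => hold the small tail back
 *	quota = tcp_cwnd_test(tp, skb);           0 => congestion window full
 *	tcp_snd_wnd_test(tp, skb, mss)            0 => receiver window full
 *
 * tcp_snd_test() returns the quota only when every gate passes, so a single
 * call answers both "may I send?" and "how many segments?".
 */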
@@ -896,19 +897,31 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) if (unlikely(sk->sk_state == TCP_CLOSE)) return 0; + skb = sk->sk_send_head; + if (unlikely(!skb)) + return 0; + + tso_segs = tcp_init_tso_segs(sk, skb); + cwnd_quota = tcp_cwnd_test(tp, skb); sent_pkts = 0; - while ((skb = sk->sk_send_head) && - tcp_snd_test(sk, skb, mss_now, - tcp_skb_is_last(sk, skb) ? nonagle : - TCP_NAGLE_PUSH)) { - if (skb->len > mss_now) { - if (tcp_fragment(sk, skb, mss_now)) + + while (cwnd_quota >= tso_segs) { + if (unlikely(!tcp_nagle_test(tp, skb, mss_now, + (tcp_skb_is_last(sk, skb) ? + nonagle : TCP_NAGLE_PUSH)))) + break; + + if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) + break; + + if (unlikely(skb->len > mss_now)) { + if (unlikely(tcp_fragment(sk, skb, mss_now))) break; } TCP_SKB_CB(skb)->when = tcp_time_stamp; tcp_tso_set_push(skb); - if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) break; /* Advance the send_head. This one is sent out. @@ -917,10 +930,19 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) update_send_head(sk, tp, skb); tcp_minshall_update(tp, mss_now, skb); - sent_pkts = 1; + sent_pkts++; + + /* Do not optimize this to use tso_segs. If we chopped up + * the packet above, tso_segs will no longer be valid. + */ + cwnd_quota -= tcp_skb_pcount(skb); + skb = sk->sk_send_head; + if (!skb) + break; + tso_segs = tcp_init_tso_segs(sk, skb); } - if (sent_pkts) { + if (likely(sent_pkts)) { tcp_cwnd_validate(sk, tp); return 0; } -- cgit From b4e26f5ea0dbdd1e813c5571fb467022d8eb948a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:20:27 -0700 Subject: [TCP]: Fix send-side cpu utiliziation regression. Only put user data purely to pages when doing TSO. The extra page allocations cause two problems: 1) Add the overhead of the page allocations themselves. 2) Make us do small user copies when we get to the end of the TCP socket cache page. It is still beneficial to purely use pages for TSO, so we will do it for that case. Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index be354155b2f9..2ba73bf3a8f9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -756,8 +756,17 @@ static inline int select_size(struct sock *sk, struct tcp_sock *tp) { int tmp = tp->mss_cache_std; - if (sk->sk_route_caps & NETIF_F_SG) - tmp = 0; + if (sk->sk_route_caps & NETIF_F_SG) { + if (sk->sk_route_caps & NETIF_F_TSO) + tmp = 0; + else { + int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); + + if (tmp >= pgbreak && + tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) + tmp = pgbreak; + } + } return tmp; } -- cgit From a56476962e92a6c389a1a561274d4a27607b7b5f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:20:41 -0700 Subject: [TCP]: Kill bogus comment above tcp_tso_acked(). Everything stated there is out of data. tcp_trim_skb() does adjust the available socket send buffer space and skb->truesize now. Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b27be2f819ac..1dba7fd438da 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1957,15 +1957,6 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) } } -/* There is one downside to this scheme. Although we keep the - * ACK clock ticking, adjusting packet counters and advancing - * congestion window, we do not liberate socket send buffer - * space. - * - * Mucking with skb->truesize and sk->sk_wmem_alloc et al. - * then making a write space wakeup callback is a possible - * future enhancement. WARNING: it is not trivial to make. - */ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, __u32 now, __s32 *seq_rtt) { -- cgit From cb83199a29dc0408423d6df432f28cc67fcadaf4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:20:55 -0700 Subject: [TCP]: Do not call tcp_tso_acked() if no work to do. In tcp_clean_rtx_queue(), if the TSO packet is not even partially acked, do not waste time calling tcp_tso_acked(). Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1dba7fd438da..b948e4eb39b7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2038,7 +2038,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt * the other end. */ if (after(scb->end_seq, tp->snd_una)) { - if (tcp_skb_pcount(skb) > 1) + if (tcp_skb_pcount(skb) > 1 && + after(tp->snd_una, scb->seq)) acked |= tcp_tso_acked(sk, skb, now, &seq_rtt); break; -- cgit From 0d9901df62fe4820aee86b49f1a074cdb5c6928e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:21:10 -0700 Subject: [TCP]: Break out send buffer expansion test. This makes it easier to understand, and allows easier tweaking of the heuristic later on. Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b948e4eb39b7..2ef2f355b8b8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3300,6 +3300,28 @@ void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } +static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) +{ + /* If the user specified a specific send buffer setting, do + * not modify it. + */ + if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) + return 0; + + /* If we are under global TCP memory pressure, do not expand. */ + if (tcp_memory_pressure) + return 0; + + /* If we are under soft global TCP memory pressure, do not expand. */ + if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) + return 0; + + /* If we filled the congestion window, do not expand. 
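Read as a checklist, the helper vetoes send buffer growth when any one of four independent conditions holds; the fourth closes the hunk just below. Restated as a compact standalone predicate (stand-in variables throughout, and a made-up value for the sysctl_tcp_mem[0] low-water mark):

#include <stdio.h>

/* Stand-in state; in the kernel this lives in sk, tp and globals. */
static int sndbuf_locked;           /* user pinned SO_SNDBUF    */
static int memory_pressure;         /* global TCP pressure flag */
static long tcp_memory_allocated;
static long tcp_mem_low = 98304;    /* hypothetical sysctl[0]   */
static unsigned int packets_out, snd_cwnd = 10;

/* Mirrors tcp_should_expand_sndbuf(): any single veto wins. */
static int should_expand_sndbuf(void)
{
    if (sndbuf_locked)
        return 0;
    if (memory_pressure)
        return 0;
    if (tcp_memory_allocated >= tcp_mem_low)
        return 0;
    if (packets_out >= snd_cwnd)
        return 0;
    return 1;
}

int main(void)
{
    printf("expand? %d\n", should_expand_sndbuf()); /* 1 here */
    return 0;
}

Factoring the test out this way is what makes the heuristic easy to tweak later, which is the stated point of the patch.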
*/ + if (tp->packets_out >= tp->snd_cwnd) + return 0; + + return 1; +} /* When incoming ACK allowed to free some skb from write_queue, * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket @@ -3311,10 +3333,7 @@ static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (tp->packets_out < tp->snd_cwnd && - !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && - !tcp_memory_pressure && - atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + if (tcp_should_expand_sndbuf(sk, tp)) { int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), demanded = max_t(unsigned int, tp->snd_cwnd, -- cgit From c1b4a7e69576d65efc31a8cea0714173c2841244 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:24:38 -0700 Subject: [TCP]: Move to new TSO segmenting scheme. Make TSO segment transmit size decisions at send time not earlier. The basic scheme is that we try to build as large a TSO frame as possible when pulling in the user data, but the size of the TSO frame output to the card is determined at transmit time. This is guided by tp->xmit_size_goal. It is always set to a multiple of MSS and tells sendmsg/sendpage how large an SKB to try and build. Later, tcp_write_xmit() and tcp_push_one() chop up the packet if necessary and conditions warrant. These routines can also decide to "defer" in order to wait for more ACKs to arrive and thus allow larger TSO frames to be emitted. A general observation is that TSO elongates the pipe, thus requiring a larger congestion window and larger buffering especially at the sender side. Therefore, it is important that applications 1) get a large enough socket send buffer (this is accomplished by our dynamic send buffer expansion code) 2) do large enough writes. Signed-off-by: David S. 
Miller --- net/ipv4/tcp.c | 26 ++- net/ipv4/tcp_input.c | 10 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_output.c | 578 +++++++++++++++++++++++++++++++------------------- net/ipv6/tcp_ipv6.c | 2 +- 5 files changed, 381 insertions(+), 237 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2ba73bf3a8f9..29894c749163 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse size_t psize, int flags) { struct tcp_sock *tp = tcp_sk(sk); - int mss_now; + int mss_now, size_goal; int err; ssize_t copied; long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); @@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; copied = 0; err = -EPIPE; @@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse int offset = poffset % PAGE_SIZE; int size = min_t(size_t, psize, PAGE_SIZE - offset); - if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { + if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -652,7 +653,7 @@ new_segment: goto wait_for_memory; skb_entail(sk, tp, skb); - copy = mss_now; + copy = size_goal; } if (copy > size) @@ -693,7 +694,7 @@ new_segment: if (!(psize -= copy)) goto out; - if (skb->len != mss_now || (flags & MSG_OOB)) + if (skb->len < mss_now || (flags & MSG_OOB)) continue; if (forced_push(tp)) { @@ -713,6 +714,7 @@ wait_for_memory: goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; } out: @@ -754,7 +756,7 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, static inline int select_size(struct sock *sk, struct tcp_sock *tp) { - int tmp = tp->mss_cache_std; + int tmp = tp->mss_cache; if (sk->sk_route_caps & NETIF_F_SG) { if (sk->sk_route_caps & NETIF_F_TSO) @@ -778,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int iovlen, flags; - int mss_now; + int mss_now, size_goal; int err, copied; long timeo; @@ -797,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; /* Ok commence sending. */ iovlen = msg->msg_iovlen; @@ -819,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, skb = sk->sk_write_queue.prev; if (!sk->sk_send_head || - (copy = mss_now - skb->len) <= 0) { + (copy = size_goal - skb->len) <= 0) { new_segment: /* Allocate new segment. If the interface is SG, @@ -842,7 +845,7 @@ new_segment: skb->ip_summed = CHECKSUM_HW; skb_entail(sk, tp, skb); - copy = mss_now; + copy = size_goal; } /* Try to append data to the end of skb. 
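Both do_tcp_sendpages() and tcp_sendmsg() now follow the same pattern: each SKB is filled up to tp->xmit_size_goal rather than to a single MSS, and only frames that have reached at least one full MSS are considered for pushing (the old skb->len != mss_now checks become skb->len < mss_now). A rough sketch of that fill-and-push rhythm, with hypothetical MSS and goal values and the bookkeeping reduced to two counters (illustrative only, not the sendmsg code path):

#include <stdio.h>

#define MSS_NOW   1448u             /* hypothetical real MSS      */
#define SIZE_GOAL (4u * MSS_NOW)    /* hypothetical TSO fill goal */

int main(void)
{
    unsigned int to_send = 10000;   /* bytes queued by the caller   */
    unsigned int skb_len = 0;       /* bytes in the SKB being built */

    while (to_send > 0) {
        /* copy = size_goal - skb->len, as in the patched code */
        unsigned int copy = SIZE_GOAL - skb_len;

        if (copy > to_send)
            copy = to_send;
        skb_len += copy;
        to_send -= copy;

        /* Frames below one MSS keep filling; larger ones may
         * be pushed out and a fresh segment started.
         */
        if (skb_len >= MSS_NOW) {
            printf("push %u bytes\n", skb_len);
            skb_len = 0;
        }
    }
    return 0;
}

Because the goal is a whole multiple of the MSS, bulk writers naturally hand tcp_write_xmit() frames it can emit as full TSO bursts, while short tail writes still go out promptly.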
*/ @@ -937,7 +940,7 @@ new_segment: if ((seglen -= copy) == 0 && iovlen == 0) goto out; - if (skb->len != mss_now || (flags & MSG_OOB)) + if (skb->len < mss_now || (flags & MSG_OOB)) continue; if (forced_push(tp)) { @@ -957,6 +960,7 @@ wait_for_memory: goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; } } @@ -2128,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_rto = jiffies_to_usecs(tp->rto); info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); - info->tcpi_snd_mss = tp->mss_cache_std; + info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = tp->ack.rcv_mss; info->tcpi_unacked = tp->packets_out; @@ -2178,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, switch (optname) { case TCP_MAXSEG: - val = tp->mss_cache_std; + val = tp->mss_cache; if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) val = tp->rx_opt.user_mss; break; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2ef2f355b8b8..8de2f1071c2b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -740,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) { - if (tp->mss_cache_std > 1460) + if (tp->mss_cache > 1460) cwnd = 2; else - cwnd = (tp->mss_cache_std > 1095) ? 3 : 4; + cwnd = (tp->mss_cache > 1095) ? 3 : 4; } return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } @@ -914,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (sk->sk_route_caps & NETIF_F_TSO) { sk->sk_route_caps &= ~NETIF_F_TSO; sock_set_flag(sk, SOCK_NO_LARGESEND); - tp->mss_cache = tp->mss_cache_std; + tp->mss_cache = tp->mss_cache; } if (!tp->sacked_out) @@ -1077,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ (IsFack(tp) || !before(lost_retrans, TCP_SKB_CB(skb)->ack_seq + tp->reordering * - tp->mss_cache_std))) { + tp->mss_cache))) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); @@ -3334,7 +3334,7 @@ static void tcp_new_space(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tcp_should_expand_sndbuf(sk, tp)) { - int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) + + int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), demanded = max_t(unsigned int, tp->snd_cwnd, tp->reordering + 1); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ebf112347a97..62f62bb05c2a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2045,7 +2045,7 @@ static int tcp_v4_init_sock(struct sock *sk) */ tp->snd_ssthresh = 0x7fffffff; /* Infinity */ tp->snd_cwnd_clamp = ~0; - tp->mss_cache_std = tp->mss_cache = 536; + tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; tp->ca_ops = &tcp_init_congestion_ops; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0a4cd24b6578..fd3ce38184ae 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1; * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. 
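The default drops from 8 to 3 in the hunk below, and the divisor is consulted by the deferral heuristic this patch adds later (tcp_tso_should_defer()): deferring stops as soon as at least 1/tso_win_divisor of the smaller of the send and congestion windows could go out in one frame. A back-of-the-envelope computation with hypothetical connection state:

#include <stdio.h>

int main(void)
{
    /* Hypothetical connection state. */
    unsigned int mss      = 1448;
    unsigned int snd_wnd  = 65535;   /* receiver window, bytes  */
    unsigned int snd_cwnd = 40;      /* congestion window, pkts */
    unsigned int divisor  = 3;       /* the new default         */

    unsigned int cwnd_bytes = snd_cwnd * mss;
    unsigned int chunk = snd_wnd < cwnd_bytes ? snd_wnd : cwnd_bytes;

    /* "chunk /= sysctl_tcp_tso_win_divisor;" in the patch. */
    chunk /= divisor;

    /* Roughly 19 KB here: once that much is sendable at once,
     * transmit immediately instead of deferring further.
     */
    printf("send threshold: %u bytes (~%u segments)\n",
           chunk, chunk / mss);
    return 0;
}

Under the old default of 8 the same state gives a threshold of about 7 KB; allowing a third of the window instead permits larger single TSO frames while still capping how bursty one transmission can be.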
*/ -int sysctl_tcp_tso_win_divisor = 8; +int sysctl_tcp_tso_win_divisor = 3; static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) @@ -403,21 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) sk->sk_send_head = skb; } -static inline void tcp_tso_set_push(struct sk_buff *skb) -{ - /* Force push to be on for any TSO frames to workaround - * problems with busted implementations like Mac OS-X that - * hold off socket receive wakeups until push is seen. - */ - if (tcp_skb_pcount(skb) > 1) - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; -} - static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - if (skb->len <= tp->mss_cache_std || + if (skb->len <= tp->mss_cache || !(sk->sk_route_caps & NETIF_F_TSO)) { /* Avoid the costly divide in the normal * non-TSO case. @@ -427,164 +417,10 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) } else { unsigned int factor; - factor = skb->len + (tp->mss_cache_std - 1); - factor /= tp->mss_cache_std; + factor = skb->len + (tp->mss_cache - 1); + factor /= tp->mss_cache; skb_shinfo(skb)->tso_segs = factor; - skb_shinfo(skb)->tso_size = tp->mss_cache_std; - } -} - -/* Does SKB fit into the send window? */ -static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss) -{ - u32 end_seq = TCP_SKB_CB(skb)->end_seq; - - return !after(end_seq, tp->snd_una + tp->snd_wnd); -} - -/* Can at least one segment of SKB be sent right now, according to the - * congestion window rules? If so, return how many segments are allowed. - */ -static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb) -{ - u32 in_flight, cwnd; - - /* Don't be strict about the congestion window for the final FIN. */ - if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) - return 1; - - in_flight = tcp_packets_in_flight(tp); - cwnd = tp->snd_cwnd; - if (in_flight < cwnd) - return (cwnd - in_flight); - - return 0; -} - -static inline int tcp_minshall_check(const struct tcp_sock *tp) -{ - return after(tp->snd_sml,tp->snd_una) && - !after(tp->snd_sml, tp->snd_nxt); -} - -/* Return 0, if packet can be sent now without violation Nagle's rules: - * 1. It is full sized. - * 2. Or it contains FIN. (already checked by caller) - * 3. Or TCP_NODELAY was set. - * 4. Or TCP_CORK is not set, and all sent packets are ACKed. - * With Minshall's modification: all sent small packets are ACKed. - */ - -static inline int tcp_nagle_check(const struct tcp_sock *tp, - const struct sk_buff *skb, - unsigned mss_now, int nonagle) -{ - return (skb->len < mss_now && - ((nonagle&TCP_NAGLE_CORK) || - (!nonagle && - tp->packets_out && - tcp_minshall_check(tp)))); -} - -/* Return non-zero if the Nagle test allows this packet to be - * sent now. - */ -static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, - unsigned int cur_mss, int nonagle) -{ - /* Nagle rule does not apply to frames, which sit in the middle of the - * write_queue (they have no chances to get new data). - * - * This is implemented in the callers, where they modify the 'nonagle' - * argument based upon the location of SKB in the send queue. - */ - if (nonagle & TCP_NAGLE_PUSH) - return 1; - - /* Don't use the nagle rule for urgent data (or for the final FIN). 
*/ - if (tp->urg_mode || - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) - return 1; - - if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) - return 1; - - return 0; -} - -/* This must be invoked the first time we consider transmitting - * SKB onto the wire. - */ -static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb) -{ - int tso_segs = tcp_skb_pcount(skb); - - if (!tso_segs) { - tcp_set_skb_tso_segs(sk, skb); - tso_segs = tcp_skb_pcount(skb); - } - return tso_segs; -} - -/* This checks if the data bearing packet SKB (usually sk->sk_send_head) - * should be put on the wire right now. If so, it returns the number of - * packets allowed by the congestion window. - */ -static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, - unsigned int cur_mss, int nonagle) -{ - struct tcp_sock *tp = tcp_sk(sk); - unsigned int cwnd_quota; - - tcp_init_tso_segs(sk, skb); - - if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) - return 0; - - cwnd_quota = tcp_cwnd_test(tp, skb); - if (cwnd_quota && - !tcp_snd_wnd_test(tp, skb, cur_mss)) - cwnd_quota = 0; - - return cwnd_quota; -} - -static inline int tcp_skb_is_last(const struct sock *sk, - const struct sk_buff *skb) -{ - return skb->next == (struct sk_buff *)&sk->sk_write_queue; -} - -int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) -{ - struct sk_buff *skb = sk->sk_send_head; - - return (skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), - (tcp_skb_is_last(sk, skb) ? - TCP_NAGLE_PUSH : - tp->nonagle))); -} - - -/* Send _single_ skb sitting at the send head. This function requires - * true push pending frames to setup probe timer etc. - */ -void tcp_push_one(struct sock *sk, unsigned cur_mss) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = sk->sk_send_head; - - if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) { - /* Send it out now. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); - if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) { - sk->sk_send_head = NULL; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tcp_packets_out_inc(sk, tp, skb); - return; - } + skb_shinfo(skb)->tso_size = tp->mss_cache; } } @@ -791,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) /* And store cached results */ tp->pmtu_cookie = pmtu; - tp->mss_cache = tp->mss_cache_std = mss_now; + tp->mss_cache = mss_now; return mss_now; } @@ -803,56 +639,47 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) * cannot be large. However, taking into account rare use of URG, this * is not a big flaw. 
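With mss_cache no longer inflated for TSO, the large-send decision moves into a separate per-socket target: xmit_size_goal stays a whole multiple of the real MSS, bounded by the 64 KB IP datagram limit less header space and by half the largest window the peer ever advertised. A standalone sketch of the calculation (the header costs are placeholders, and the ext_header_len and 68-byte floor details of the real function are omitted):

#include <stdio.h>

#define NET_HEADER_LEN 20u   /* placeholder: IPv4 header   */
#define TCP_HEADER_LEN 32u   /* placeholder: TCP + options */

static unsigned int xmit_size_goal(unsigned int mss_now,
                                   unsigned int max_window,
                                   int doing_tso)
{
    unsigned int goal = mss_now;

    if (doing_tso) {
        goal = 65535u - NET_HEADER_LEN - TCP_HEADER_LEN;

        /* Never aim beyond half the peer's max window. */
        if (max_window && goal > (max_window >> 1))
            goal = max_window >> 1;

        /* Round down to an exact multiple of the real MSS. */
        goal -= goal % mss_now;
    }
    return goal;
}

int main(void)
{
    /* mss 1448, peer max window 65535: goal = 31856 = 22 * mss */
    printf("%u\n", xmit_size_goal(1448, 65535, 1));
    return 0;
}

sendmsg and sendpage use the goal to size the SKBs they build, while the size actually put on the wire is decided at transmit time by tcp_write_xmit() and tcp_push_one().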
*/ - -unsigned int tcp_current_mss(struct sock *sk, int large) +unsigned int tcp_current_mss(struct sock *sk, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); - unsigned int do_large, mss_now; + u32 mss_now; + u16 xmit_size_goal; + int doing_tso = 0; + + mss_now = tp->mss_cache; + + if (large_allowed && + (sk->sk_route_caps & NETIF_F_TSO) && + !tp->urg_mode) + doing_tso = 1; - mss_now = tp->mss_cache_std; if (dst) { u32 mtu = dst_mtu(dst); if (mtu != tp->pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } - do_large = (large && - (sk->sk_route_caps & NETIF_F_TSO) && - !tp->urg_mode); + if (tp->rx_opt.eff_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); - if (do_large) { - unsigned int large_mss, factor, limit; + xmit_size_goal = mss_now; - large_mss = 65535 - tp->af_specific->net_header_len - + if (doing_tso) { + xmit_size_goal = 65535 - + tp->af_specific->net_header_len - tp->ext_header_len - tp->tcp_header_len; - if (tp->max_window && large_mss > (tp->max_window>>1)) - large_mss = max((tp->max_window>>1), - 68U - tp->tcp_header_len); - - factor = large_mss / mss_now; + if (tp->max_window && + (xmit_size_goal > (tp->max_window >> 1))) + xmit_size_goal = max((tp->max_window >> 1), + 68U - tp->tcp_header_len); - /* Always keep large mss multiple of real mss, but - * do not exceed 1/tso_win_divisor of the congestion window - * so we can keep the ACK clock ticking and minimize - * bursting. - */ - limit = tp->snd_cwnd; - if (sysctl_tcp_tso_win_divisor) - limit /= sysctl_tcp_tso_win_divisor; - limit = max(1U, limit); - if (factor > limit) - factor = limit; - - tp->mss_cache = mss_now * factor; - - mss_now = tp->mss_cache; + xmit_size_goal -= (xmit_size_goal % mss_now); } + tp->xmit_size_goal = xmit_size_goal; - if (tp->rx_opt.eff_sacks) - mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); return mss_now; } @@ -876,6 +703,251 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) } } +static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd) +{ + u32 window, cwnd_len; + + window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq); + cwnd_len = mss_now * cwnd; + return min(window, cwnd_len); +} + +/* Can at least one segment of SKB be sent right now, according to the + * congestion window rules? If so, return how many segments are allowed. + */ +static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 in_flight, cwnd; + + /* Don't be strict about the congestion window for the final FIN. */ + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + return 1; + + in_flight = tcp_packets_in_flight(tp); + cwnd = tp->snd_cwnd; + if (in_flight < cwnd) + return (cwnd - in_flight); + + return 0; +} + +/* This must be invoked the first time we consider transmitting + * SKB onto the wire. + */ +static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb) +{ + int tso_segs = tcp_skb_pcount(skb); + + if (!tso_segs) { + tcp_set_skb_tso_segs(sk, skb); + tso_segs = tcp_skb_pcount(skb); + } + return tso_segs; +} + +static inline int tcp_minshall_check(const struct tcp_sock *tp) +{ + return after(tp->snd_sml,tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +/* Return 0, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. + * 2. Or it contains FIN. (already checked by caller) + * 3. Or TCP_NODELAY was set. + * 4. 
Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. + */ + +static inline int tcp_nagle_check(const struct tcp_sock *tp, + const struct sk_buff *skb, + unsigned mss_now, int nonagle) +{ + return (skb->len < mss_now && + ((nonagle&TCP_NAGLE_CORK) || + (!nonagle && + tp->packets_out && + tcp_minshall_check(tp)))); +} + +/* Return non-zero if the Nagle test allows this packet to be + * sent now. + */ +static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) +{ + /* Nagle rule does not apply to frames, which sit in the middle of the + * write_queue (they have no chances to get new data). + * + * This is implemented in the callers, where they modify the 'nonagle' + * argument based upon the location of SKB in the send queue. + */ + if (nonagle & TCP_NAGLE_PUSH) + return 1; + + /* Don't use the nagle rule for urgent data (or for the final FIN). */ + if (tp->urg_mode || + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) + return 1; + + if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) + return 1; + + return 0; +} + +/* Does at least the first segment of SKB fit into the send window? */ +static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss) +{ + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (skb->len > cur_mss) + end_seq = TCP_SKB_CB(skb)->seq + cur_mss; + + return !after(end_seq, tp->snd_una + tp->snd_wnd); +} + +/* This checks if the data bearing packet SKB (usually sk->sk_send_head) + * should be put on the wire right now. If so, it returns the number of + * packets allowed by the congestion window. + */ +static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int cwnd_quota; + + tcp_init_tso_segs(sk, skb); + + if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) + return 0; + + cwnd_quota = tcp_cwnd_test(tp, skb); + if (cwnd_quota && + !tcp_snd_wnd_test(tp, skb, cur_mss)) + cwnd_quota = 0; + + return cwnd_quota; +} + +static inline int tcp_skb_is_last(const struct sock *sk, + const struct sk_buff *skb) +{ + return skb->next == (struct sk_buff *)&sk->sk_write_queue; +} + +int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) +{ + struct sk_buff *skb = sk->sk_send_head; + + return (skb && + tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), + (tcp_skb_is_last(sk, skb) ? + TCP_NAGLE_PUSH : + tp->nonagle))); +} + +/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet + * which is put after SKB on the list. It is very much like + * tcp_fragment() except that it may make several kinds of assumptions + * in order to speed up the splitting operation. In particular, we + * know that all the data is in scatter-gather pages, and that the + * packet has never been sent out before (and thus is not cloned). + */ +static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len) +{ + struct sk_buff *buff; + int nlen = skb->len - len; + u16 flags; + + /* All of a TSO frame must be composed of paged data. */ + BUG_ON(skb->len != skb->data_len); + + buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC); + if (unlikely(buff == NULL)) + return -ENOMEM; + + buff->truesize = nlen; + skb->truesize -= nlen; + + /* Correct the sequence numbers. 
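tso_fragment() keeps the first len bytes where they are and carves the remainder into a trailing buff, so the sequence bookkeeping that follows is plain interval splitting. In isolation, with hypothetical sequence numbers:

#include <stdio.h>

int main(void)
{
    /* Hypothetical TSO frame [seq, end_seq): 5792 bytes. */
    unsigned int seq = 1000, end_seq = 6792;
    unsigned int len = 2896;            /* bytes kept in skb */

    /* The same three assignments as in the lines below. */
    unsigned int buff_seq     = seq + len;
    unsigned int buff_end_seq = end_seq;
    unsigned int skb_end_seq  = buff_seq;

    printf("skb  [%u, %u)\n", seq, skb_end_seq);       /* [1000, 3896) */
    printf("buff [%u, %u)\n", buff_seq, buff_end_seq); /* [3896, 6792) */
    return 0;
}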
*/ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->flags; + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + TCP_SKB_CB(buff)->flags = flags; + + /* This packet was never sent out yet, so no SACK bits. */ + TCP_SKB_CB(buff)->sacked = 0; + + buff->ip_summed = skb->ip_summed = CHECKSUM_HW; + skb_split(skb, buff, len); + + /* Fix up tso_factor for both original and new SKB. */ + tcp_set_skb_tso_segs(sk, skb); + tcp_set_skb_tso_segs(sk, buff); + + /* Link BUFF into the send queue. */ + skb_header_release(buff); + __skb_append(skb, buff); + + return 0; +} + +/* Try to defer sending, if possible, in order to minimize the amount + * of TSO splitting we do. View it as a kind of TSO Nagle test. + * + * This algorithm is from John Heffner. + */ +static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 send_win, cong_win, limit, in_flight; + + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + return 0; + + in_flight = tcp_packets_in_flight(tp); + + BUG_ON(tcp_skb_pcount(skb) <= 1 || + (tp->snd_cwnd <= in_flight)); + + send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq; + + /* From in_flight test above, we know that cwnd > in_flight. */ + cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; + + limit = min(send_win, cong_win); + + /* If sk_send_head can be sent fully now, just do it. */ + if (skb->len <= limit) + return 0; + + if (sysctl_tcp_tso_win_divisor) { + u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); + + /* If at least some fraction of a window is available, + * just use it. + */ + chunk /= sysctl_tcp_tso_win_divisor; + if (limit >= chunk) + return 0; + } else { + /* Different approach, try not to defer past a single + * ACK. Receiver should ACK every other full sized + * frame, so if we have space for more than 3 frames + * then send now. + */ + if (limit > tcp_max_burst(tp) * tp->mss_cache) + return 0; + } + + /* Ok, it looks like it is advisable to defer. */ + return 1; +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -887,8 +959,8 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int tso_segs, cwnd_quota; - int sent_pkts; + unsigned int tso_segs, sent_pkts; + int cwnd_quota; /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and all @@ -903,24 +975,44 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) tso_segs = tcp_init_tso_segs(sk, skb); cwnd_quota = tcp_cwnd_test(tp, skb); + if (unlikely(!cwnd_quota)) + goto out; + sent_pkts = 0; + while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) { + BUG_ON(!tso_segs); - while (cwnd_quota >= tso_segs) { - if (unlikely(!tcp_nagle_test(tp, skb, mss_now, - (tcp_skb_is_last(sk, skb) ? - nonagle : TCP_NAGLE_PUSH)))) - break; + if (tso_segs == 1) { + if (unlikely(!tcp_nagle_test(tp, skb, mss_now, + (tcp_skb_is_last(sk, skb) ? 
+ nonagle : TCP_NAGLE_PUSH)))) + break; + } else { + if (tcp_tso_should_defer(sk, tp, skb)) + break; + } - if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) - break; + if (tso_segs > 1) { + u32 limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); + + if (skb->len < limit) { + unsigned int trim = skb->len % mss_now; - if (unlikely(skb->len > mss_now)) { + if (trim) + limit = skb->len - trim; + } + if (skb->len > limit) { + if (tso_fragment(sk, skb, limit)) + break; + } + } else if (unlikely(skb->len > mss_now)) { if (unlikely(tcp_fragment(sk, skb, mss_now))) break; } TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); + if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) break; @@ -936,6 +1028,11 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) * the packet above, tso_segs will no longer be valid. */ cwnd_quota -= tcp_skb_pcount(skb); + + BUG_ON(cwnd_quota < 0); + if (!cwnd_quota) + break; + skb = sk->sk_send_head; if (!skb) break; @@ -946,7 +1043,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) tcp_cwnd_validate(sk, tp); return 0; } - +out: return !tp->packets_out && sk->sk_send_head; } @@ -965,6 +1062,53 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, } } +/* Send _single_ skb sitting at the send head. This function requires + * true push pending frames to setup probe timer etc. + */ +void tcp_push_one(struct sock *sk, unsigned int mss_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = sk->sk_send_head; + unsigned int tso_segs, cwnd_quota; + + BUG_ON(!skb || skb->len < mss_now); + + tso_segs = tcp_init_tso_segs(sk, skb); + cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH); + + if (likely(cwnd_quota)) { + BUG_ON(!tso_segs); + + if (tso_segs > 1) { + u32 limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); + + if (skb->len < limit) { + unsigned int trim = skb->len % mss_now; + + if (trim) + limit = skb->len - trim; + } + if (skb->len > limit) { + if (unlikely(tso_fragment(sk, skb, limit))) + return; + } + } else if (unlikely(skb->len > mss_now)) { + if (unlikely(tcp_fragment(sk, skb, mss_now))) + return; + } + + /* Send it out now. */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + + if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) { + update_send_head(sk, tp, skb); + tcp_cwnd_validate(sk, tp); + return; + } + } +} + /* This function returns the amount that we can raise the * usable window based on the following constraints * @@ -1222,7 +1366,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (sk->sk_route_caps & NETIF_F_TSO) { sk->sk_route_caps &= ~NETIF_F_TSO; sock_set_flag(sk, SOCK_NO_LARGESEND); - tp->mss_cache = tp->mss_cache_std; } if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) @@ -1284,7 +1427,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * is still in somebody's hands, else make a clone. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); err = tcp_transmit_skb(sk, (skb_cloned(skb) ? 
pskb_copy(skb, GFP_ATOMIC): @@ -1853,14 +1995,12 @@ int tcp_write_wakeup(struct sock *sk) if (sk->sk_route_caps & NETIF_F_TSO) { sock_set_flag(sk, SOCK_NO_LARGESEND); sk->sk_route_caps &= ~NETIF_F_TSO; - tp->mss_cache = tp->mss_cache_std; } } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(sk, skb); TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); if (!err) { update_send_head(sk, tp, skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9dac7fdf4726..f6e288dc116e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2018,7 +2018,7 @@ static int tcp_v6_init_sock(struct sock *sk) */ tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_clamp = ~0; - tp->mss_cache_std = tp->mss_cache = 536; + tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; -- cgit From 63d886c96b2a580b1bf764de238ba3c63515b5ee Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 5 Jul 2005 15:29:16 -0700 Subject: [PKT_SCHED]: Blackhole queueing discipline Useful in combination with classful qdiscs to drop or temporary disable certain flows, e.g. one could block specific ds flows with dsmark. Unlike the noop qdisc it can be controlled by the user and statistic accounting is done. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/sched/Makefile | 2 +- net/sched/sch_blackhole.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 net/sched/sch_blackhole.c (limited to 'net') diff --git a/net/sched/Makefile b/net/sched/Makefile index 8f58cecd6266..e48d0d456b3e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -4,7 +4,7 @@ obj-y := sch_generic.o -obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o +obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o sch_blackhole.o obj-$(CONFIG_NET_CLS) += cls_api.o obj-$(CONFIG_NET_CLS_ACT) += act_api.o obj-$(CONFIG_NET_ACT_POLICE) += police.o diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c new file mode 100644 index 000000000000..81f0b8346d17 --- /dev/null +++ b/net/sched/sch_blackhole.c @@ -0,0 +1,54 @@ +/* + * net/sched/sch_blackhole.c Black hole queue + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf + * + * Note: Quantum tunneling is not supported. + */ + +#include +#include +#include +#include +#include +#include +#include + +static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + qdisc_drop(skb, sch); + return NET_XMIT_SUCCESS; +} + +static struct sk_buff *blackhole_dequeue(struct Qdisc *sch) +{ + return NULL; +} + +static struct Qdisc_ops blackhole_qdisc_ops = { + .id = "blackhole", + .priv_size = 0, + .enqueue = blackhole_enqueue, + .dequeue = blackhole_dequeue, + .owner = THIS_MODULE, +}; + +static int __init blackhole_module_init(void) +{ + return register_qdisc(&blackhole_qdisc_ops); +} + +static void __exit blackhole_module_exit(void) +{ + unregister_qdisc(&blackhole_qdisc_ops); +} + +module_init(blackhole_module_init) +module_exit(blackhole_module_exit) + +MODULE_LICENSE("GPL"); -- cgit From 908a75c17a9e5a888347c2c1d3572203d1b1c7db Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:43:58 -0700 Subject: [TCP]: Never TSO defer under periods of congestion. 
Congestion window recover after loss depends upon the fact that if we have a full MSS sized frame at the head of the send queue, we will send it. TSO deferral can defeat the ACK clocking necessary to exit cleanly from recovery. Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fd3ce38184ae..e041d057ec86 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -909,6 +909,9 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_ if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) return 0; + if (tp->ca_state != TCP_CA_Open) + return 0; + in_flight = tcp_packets_in_flight(tp); BUG_ON(tcp_skb_pcount(skb) <= 1 || -- cgit From 7e8d7e3c9e38dab8d28a8667faa4941842f64213 Mon Sep 17 00:00:00 2001 From: "KAMBAROV, ZAUR" Date: Thu, 7 Jul 2005 17:57:07 -0700 Subject: [PATCH] coverity: sunrpc/xprt task null check In __xprt_lock_write() we check to see if `task' is NULL, but in other places we just go and dereference it. `task' shouldn't be NULL anyway, so remove this test. This defect was found automatically by Coverity Prevent, a static analysis tool. Signed-off-by: Zaur Kambarov Acked-by: Trond Myklebust Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/sunrpc/xprt.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 269f217918a3..3c654e06b084 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -145,8 +145,6 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) { if (task == xprt->snd_task) return 1; - if (task == NULL) - return 0; goto out_sleep; } if (xprt->nocong || __xprt_get_cong(xprt, task)) { -- cgit From b03efcfb2180289718991bb984044ce6c5b7d1b0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 8 Jul 2005 14:57:23 -0700 Subject: [NET]: Transform skb_queue_len() binary tests into skb_queue_empty() This is part of the grand scheme to eliminate the qlen member of skb_queue_head, and subsequently remove the 'list' member of sk_buff. Most users of skb_queue_len() want to know if the queue is empty or not, and that's trivially done with skb_queue_empty() which doesn't use the skb_queue_head->qlen member and instead uses the queue list emptyness as the test. Signed-off-by: David S. 
Miller --- net/bluetooth/cmtp/core.c | 6 ++---- net/bluetooth/hidp/core.c | 5 +---- net/bluetooth/rfcomm/sock.c | 7 +++++-- net/bluetooth/rfcomm/tty.c | 2 +- net/decnet/af_decnet.c | 10 +++++----- net/decnet/dn_nsp_out.c | 3 ++- net/ipv4/tcp.c | 8 ++++---- net/ipv4/tcp_input.c | 11 +++++------ net/ipv4/tcp_timer.c | 5 ++--- net/irda/irlap.c | 3 +-- net/irda/irlap_event.c | 14 +++++++------- net/irda/irlap_frame.c | 8 +++----- net/irda/irttp.c | 2 +- net/llc/llc_c_ev.c | 2 +- net/netlink/af_netlink.c | 2 +- net/sched/sch_red.c | 2 +- net/unix/af_unix.c | 4 ++-- 17 files changed, 44 insertions(+), 50 deletions(-) (limited to 'net') diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 2e341de3e763..901eff7ebe74 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -213,7 +213,7 @@ static int cmtp_send_frame(struct cmtp_session *session, unsigned char *data, in return kernel_sendmsg(sock, &msg, &iv, 1, len); } -static int cmtp_process_transmit(struct cmtp_session *session) +static void cmtp_process_transmit(struct cmtp_session *session) { struct sk_buff *skb, *nskb; unsigned char *hdr; @@ -223,7 +223,7 @@ static int cmtp_process_transmit(struct cmtp_session *session) if (!(nskb = alloc_skb(session->mtu, GFP_ATOMIC))) { BT_ERR("Can't allocate memory for new frame"); - return -ENOMEM; + return; } while ((skb = skb_dequeue(&session->transmit))) { @@ -275,8 +275,6 @@ static int cmtp_process_transmit(struct cmtp_session *session) cmtp_send_frame(session, nskb->data, nskb->len); kfree_skb(nskb); - - return skb_queue_len(&session->transmit); } static int cmtp_session(void *arg) diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index affbc55462e8..de8af5f42394 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -428,7 +428,7 @@ static int hidp_send_frame(struct socket *sock, unsigned char *data, int len) return kernel_sendmsg(sock, &msg, &iv, 1, len); } -static int hidp_process_transmit(struct hidp_session *session) +static void hidp_process_transmit(struct hidp_session *session) { struct sk_buff *skb; @@ -453,9 +453,6 @@ static int hidp_process_transmit(struct hidp_session *session) hidp_set_timer(session); kfree_skb(skb); } - - return skb_queue_len(&session->ctrl_transmit) + - skb_queue_len(&session->intr_transmit); } static int hidp_session(void *arg) diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index f3f6355a2786..63a123c5c41b 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -590,8 +590,11 @@ static long rfcomm_sock_data_wait(struct sock *sk, long timeo) for (;;) { set_current_state(TASK_INTERRUPTIBLE); - if (skb_queue_len(&sk->sk_receive_queue) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || - signal_pending(current) || !timeo) + if (!skb_queue_empty(&sk->sk_receive_queue) || + sk->sk_err || + (sk->sk_shutdown & RCV_SHUTDOWN) || + signal_pending(current) || + !timeo) break; set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 6d689200bcf3..6304590fd36a 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -781,7 +781,7 @@ static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty) BT_DBG("tty %p dev %p", tty, dev); - if (skb_queue_len(&dlc->tx_queue)) + if (!skb_queue_empty(&dlc->tx_queue)) return dlc->mtu; return 0; diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 29bb3cd21965..96a02800cd28 100644 --- a/net/decnet/af_decnet.c +++ 
b/net/decnet/af_decnet.c @@ -536,7 +536,7 @@ static void dn_keepalive(struct sock *sk) * we are double checking that we are not sending too * many of these keepalive frames. */ - if (skb_queue_len(&scp->other_xmit_queue) == 0) + if (skb_queue_empty(&scp->other_xmit_queue)) dn_nsp_send_link(sk, DN_NOCHANGE, 0); } @@ -1191,7 +1191,7 @@ static unsigned int dn_poll(struct file *file, struct socket *sock, poll_table struct dn_scp *scp = DN_SK(sk); int mask = datagram_poll(file, sock, wait); - if (skb_queue_len(&scp->other_receive_queue)) + if (!skb_queue_empty(&scp->other_receive_queue)) mask |= POLLRDBAND; return mask; @@ -1214,7 +1214,7 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCATMARK: lock_sock(sk); - val = (skb_queue_len(&scp->other_receive_queue) != 0); + val = !skb_queue_empty(&scp->other_receive_queue); if (scp->state != DN_RUN) val = -ENOTCONN; release_sock(sk); @@ -1630,7 +1630,7 @@ static int dn_data_ready(struct sock *sk, struct sk_buff_head *q, int flags, int int len = 0; if (flags & MSG_OOB) - return skb_queue_len(q) ? 1 : 0; + return !skb_queue_empty(q) ? 1 : 0; while(skb != (struct sk_buff *)q) { struct dn_skb_cb *cb = DN_SKB_CB(skb); @@ -1707,7 +1707,7 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock, if (sk->sk_err) goto out; - if (skb_queue_len(&scp->other_receive_queue)) { + if (!skb_queue_empty(&scp->other_receive_queue)) { if (!(flags & MSG_OOB)) { msg->msg_flags |= MSG_OOB; if (!scp->other_report) { diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 42abbf3f524f..8cce1fdbda90 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -342,7 +342,8 @@ int dn_nsp_xmit_timeout(struct sock *sk) dn_nsp_output(sk); - if (skb_queue_len(&scp->data_xmit_queue) || skb_queue_len(&scp->other_xmit_queue)) + if (!skb_queue_empty(&scp->data_xmit_queue) || + !skb_queue_empty(&scp->other_xmit_queue)) scp->persist = dn_nsp_persist(sk); return 0; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 29894c749163..ddb6ce4ecff2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1105,7 +1105,7 @@ static void tcp_prequeue_process(struct sock *sk) struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); - NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue)); + NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED); /* RX process wants to run with disabled BHs, though it is not * necessary */ @@ -1369,7 +1369,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, * is not empty. It is more elegant, but eats cycles, * unfortunately. */ - if (skb_queue_len(&tp->ucopy.prequeue)) + if (!skb_queue_empty(&tp->ucopy.prequeue)) goto do_prequeue; /* __ Set realtime policy in scheduler __ */ @@ -1394,7 +1394,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } if (tp->rcv_nxt == tp->copied_seq && - skb_queue_len(&tp->ucopy.prequeue)) { + !skb_queue_empty(&tp->ucopy.prequeue)) { do_prequeue: tcp_prequeue_process(sk); @@ -1476,7 +1476,7 @@ skip_copy: } while (len > 0); if (user_recv) { - if (skb_queue_len(&tp->ucopy.prequeue)) { + if (!skb_queue_empty(&tp->ucopy.prequeue)) { int chunk; tp->ucopy.len = copied > 0 ? len : 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8de2f1071c2b..53a8a5399f1e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2802,7 +2802,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) int this_sack; /* Empty ofo queue, hence, all the SACKs are eaten. Clear. 
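Every conversion in this series leans on the same invariant: sk_buff_head is a doubly linked circular list, so an empty queue's head points back at itself and emptiness is decidable without the qlen counter that is slated for removal. A self-contained model of the two tests (simplified structures, not the kernel's skbuff.h):

#include <stdio.h>

/* Minimal circular-list model of struct sk_buff_head. */
struct node {
    struct node *next, *prev;
    unsigned int qlen;          /* counter being phased out */
};

static void queue_init(struct node *head)
{
    head->next = head->prev = head;
    head->qlen = 0;
}

/* The preferred test: pure list emptiness, no counter. */
static int queue_empty(const struct node *head)
{
    return head->next == head;
}

int main(void)
{
    struct node q;

    queue_init(&q);
    /* Both report empty, but only the first needs qlen. */
    printf("len==0: %d, empty: %d\n", q.qlen == 0, queue_empty(&q));
    return 0;
}

Callers that genuinely need a count, such as the IrDA debug printout above, keep skb_queue_len().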
*/ - if (skb_queue_len(&tp->out_of_order_queue) == 0) { + if (skb_queue_empty(&tp->out_of_order_queue)) { tp->rx_opt.num_sacks = 0; tp->rx_opt.eff_sacks = tp->rx_opt.dsack; return; @@ -2935,13 +2935,13 @@ queue_and_out: if(th->fin) tcp_fin(skb, sk, th); - if (skb_queue_len(&tp->out_of_order_queue)) { + if (!skb_queue_empty(&tp->out_of_order_queue)) { tcp_ofo_queue(sk); /* RFC2581. 4.2. SHOULD send immediate ACK, when * gap in queue is filled. */ - if (!skb_queue_len(&tp->out_of_order_queue)) + if (skb_queue_empty(&tp->out_of_order_queue)) tp->ack.pingpong = 0; } @@ -3249,9 +3249,8 @@ static int tcp_prune_queue(struct sock *sk) * This must not ever occur. */ /* First, purge the out_of_order queue. */ - if (skb_queue_len(&tp->out_of_order_queue)) { - NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED, - skb_queue_len(&tp->out_of_order_queue)); + if (!skb_queue_empty(&tp->out_of_order_queue)) { + NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED); __skb_queue_purge(&tp->out_of_order_queue); /* Reset SACK state. A conforming SACK implementation will diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b127b4498565..0084227438c2 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -231,11 +231,10 @@ static void tcp_delack_timer(unsigned long data) } tp->ack.pending &= ~TCP_ACK_TIMER; - if (skb_queue_len(&tp->ucopy.prequeue)) { + if (!skb_queue_empty(&tp->ucopy.prequeue)) { struct sk_buff *skb; - NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED, - skb_queue_len(&tp->ucopy.prequeue)); + NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED); while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) sk->sk_backlog_rcv(sk, skb); diff --git a/net/irda/irlap.c b/net/irda/irlap.c index 046ad0750e48..7029618f5719 100644 --- a/net/irda/irlap.c +++ b/net/irda/irlap.c @@ -445,9 +445,8 @@ void irlap_disconnect_request(struct irlap_cb *self) IRDA_ASSERT(self->magic == LAP_MAGIC, return;); /* Don't disconnect until all data frames are successfully sent */ - if (skb_queue_len(&self->txq) > 0) { + if (!skb_queue_empty(&self->txq)) { self->disconnect_pending = TRUE; - return; } diff --git a/net/irda/irlap_event.c b/net/irda/irlap_event.c index 1cd89f5f3b75..a505b5457608 100644 --- a/net/irda/irlap_event.c +++ b/net/irda/irlap_event.c @@ -191,7 +191,7 @@ static void irlap_start_poll_timer(struct irlap_cb *self, int timeout) * Send out the RR frames faster if our own transmit queue is empty, or * if the peer is busy. The effect is a much faster conversation */ - if ((skb_queue_len(&self->txq) == 0) || (self->remote_busy)) { + if (skb_queue_empty(&self->txq) || self->remote_busy) { if (self->fast_RR == TRUE) { /* * Assert that the fast poll timer has not reached the @@ -263,7 +263,7 @@ void irlap_do_event(struct irlap_cb *self, IRLAP_EVENT event, IRDA_DEBUG(2, "%s() : queue len = %d\n", __FUNCTION__, skb_queue_len(&self->txq)); - if (skb_queue_len(&self->txq)) { + if (!skb_queue_empty(&self->txq)) { /* Prevent race conditions with irlap_data_request() */ self->local_busy = TRUE; @@ -1074,7 +1074,7 @@ static int irlap_state_xmit_p(struct irlap_cb *self, IRLAP_EVENT event, #else /* CONFIG_IRDA_DYNAMIC_WINDOW */ /* Window has been adjusted for the max packet * size, so much simpler... 
- Jean II */ - nextfit = (skb_queue_len(&self->txq) > 0); + nextfit = !skb_queue_empty(&self->txq); #endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ /* * Send data with poll bit cleared only if window > 1 @@ -1814,7 +1814,7 @@ static int irlap_state_xmit_s(struct irlap_cb *self, IRLAP_EVENT event, #else /* CONFIG_IRDA_DYNAMIC_WINDOW */ /* Window has been adjusted for the max packet * size, so much simpler... - Jean II */ - nextfit = (skb_queue_len(&self->txq) > 0); + nextfit = !skb_queue_empty(&self->txq); #endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ /* * Send data with final bit cleared only if window > 1 @@ -1937,7 +1937,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event, irlap_data_indication(self, skb, FALSE); /* Any pending data requests? */ - if ((skb_queue_len(&self->txq) > 0) && + if (!skb_queue_empty(&self->txq) && (self->window > 0)) { self->ack_required = TRUE; @@ -2038,7 +2038,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event, /* * Any pending data requests? */ - if ((skb_queue_len(&self->txq) > 0) && + if (!skb_queue_empty(&self->txq) && (self->window > 0) && !self->remote_busy) { irlap_data_indication(self, skb, TRUE); @@ -2069,7 +2069,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event, */ nr_status = irlap_validate_nr_received(self, info->nr); if (nr_status == NR_EXPECTED) { - if ((skb_queue_len( &self->txq) > 0) && + if (!skb_queue_empty(&self->txq) && (self->window > 0)) { self->remote_busy = FALSE; diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c index 040abe714aa3..6dafbb43b529 100644 --- a/net/irda/irlap_frame.c +++ b/net/irda/irlap_frame.c @@ -1018,11 +1018,10 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command) /* * We can now fill the window with additional data frames */ - while (skb_queue_len( &self->txq) > 0) { + while (!skb_queue_empty(&self->txq)) { IRDA_DEBUG(0, "%s(), sending additional frames!\n", __FUNCTION__); - if ((skb_queue_len( &self->txq) > 0) && - (self->window > 0)) { + if (self->window > 0) { skb = skb_dequeue( &self->txq); IRDA_ASSERT(skb != NULL, return;); @@ -1031,8 +1030,7 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command) * bit cleared */ if ((self->window > 1) && - skb_queue_len(&self->txq) > 0) - { + !skb_queue_empty(&self->txq)) { irlap_send_data_primary(self, skb); } else { irlap_send_data_primary_poll(self, skb); diff --git a/net/irda/irttp.c b/net/irda/irttp.c index d091ccf773b3..6602d901f8b1 100644 --- a/net/irda/irttp.c +++ b/net/irda/irttp.c @@ -1513,7 +1513,7 @@ int irttp_disconnect_request(struct tsap_cb *self, struct sk_buff *userdata, /* * Check if there is still data segments in the transmit queue */ - if (skb_queue_len(&self->tx_queue) > 0) { + if (!skb_queue_empty(&self->tx_queue)) { if (priority == P_HIGH) { /* * No need to send the queued data, if we are diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c index cd130c3b72bc..d5bdb53a348f 100644 --- a/net/llc/llc_c_ev.c +++ b/net/llc/llc_c_ev.c @@ -84,7 +84,7 @@ static u16 llc_util_nr_inside_tx_window(struct sock *sk, u8 nr) if (llc->dev->flags & IFF_LOOPBACK) goto out; rc = 1; - if (!skb_queue_len(&llc->pdu_unack_q)) + if (skb_queue_empty(&llc->pdu_unack_q)) goto out; skb = skb_peek(&llc->pdu_unack_q); pdu = llc_pdu_sn_hdr(skb); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index fc456a7aaec3..3405fdf41b93 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -858,7 +858,7 @@ static inline void netlink_rcv_wake(struct sock *sk) 
{ struct netlink_sock *nlk = nlk_sk(sk); - if (!skb_queue_len(&sk->sk_receive_queue)) + if (skb_queue_empty(&sk->sk_receive_queue)) clear_bit(0, &nlk->state); if (!test_bit(0, &nlk->state)) wake_up_interruptible(&nlk->wait); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 664d0e47374f..7845d045eec4 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -385,7 +385,7 @@ static int red_change(struct Qdisc *sch, struct rtattr *opt) memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256); q->qcount = -1; - if (skb_queue_len(&sch->q) == 0) + if (skb_queue_empty(&sch->q)) PSCHED_SET_PASTPERFECT(q->qidlestart); sch_tree_unlock(sch); return 0; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c420eba4876b..d403e34088ad 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -302,7 +302,7 @@ static void unix_write_space(struct sock *sk) * may receive messages only from that peer. */ static void unix_dgram_disconnected(struct sock *sk, struct sock *other) { - if (skb_queue_len(&sk->sk_receive_queue)) { + if (!skb_queue_empty(&sk->sk_receive_queue)) { skb_queue_purge(&sk->sk_receive_queue); wake_up_interruptible_all(&unix_sk(sk)->peer_wait); @@ -1619,7 +1619,7 @@ static long unix_stream_data_wait(struct sock * sk, long timeo) for (;;) { prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); - if (skb_queue_len(&sk->sk_receive_queue) || + if (!skb_queue_empty(&sk->sk_receive_queue) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || -- cgit From 86a76caf8705e3524e15f343f3c4806939a06dc8 Mon Sep 17 00:00:00 2001 From: Victor Fusco Date: Fri, 8 Jul 2005 14:57:47 -0700 Subject: [NET]: Fix sparse warnings From: Victor Fusco Fix the sparse warning "implicit cast to nocast type" Signed-off-by: Victor Fusco Signed-off-by: Domen Puncer Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- net/core/skbuff.c | 17 ++++++++++------- net/core/sock.c | 11 +++++++---- net/ipv4/tcp_output.c | 2 +- 4 files changed, 19 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 7f5f62c65115..ff9dc029233a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1127,7 +1127,7 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) extern void skb_release_data(struct sk_buff *); /* Keep head the same: replace data */ -int __skb_linearize(struct sk_buff *skb, int gfp_mask) +int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask) { unsigned int size; u8 *data; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 733deee24b9f..d9f7b06fe886 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -129,7 +129,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) * Buffers may only be allocated from interrupts using a @gfp_mask of * %GFP_ATOMIC. */ -struct sk_buff *alloc_skb(unsigned int size, int gfp_mask) +struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask) { struct sk_buff *skb; u8 *data; @@ -182,7 +182,8 @@ nodata: * %GFP_ATOMIC. */ struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp, - unsigned int size, int gfp_mask) + unsigned int size, + unsigned int __nocast gfp_mask) { struct sk_buff *skb; u8 *data; @@ -322,7 +323,7 @@ void __kfree_skb(struct sk_buff *skb) * %GFP_ATOMIC. 
*/ -struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) +struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) { struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); @@ -460,7 +461,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) * header is going to be modified. Use pskb_copy() instead. */ -struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask) +struct sk_buff *skb_copy(const struct sk_buff *skb, unsigned int __nocast gfp_mask) { int headerlen = skb->data - skb->head; /* @@ -499,7 +500,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask) * The returned buffer has a reference count of 1. */ -struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask) +struct sk_buff *pskb_copy(struct sk_buff *skb, unsigned int __nocast gfp_mask) { /* * Allocate the copy buffer @@ -557,7 +558,8 @@ out: * reloaded after call to this function. */ -int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask) +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, + unsigned int __nocast gfp_mask) { int i; u8 *data; @@ -647,7 +649,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) * only by netfilter in the cases when checksum is recalculated? --ANK */ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, - int newheadroom, int newtailroom, int gfp_mask) + int newheadroom, int newtailroom, + unsigned int __nocast gfp_mask) { /* * Allocate the copy buffer diff --git a/net/core/sock.c b/net/core/sock.c index a6ec3ada7f9e..8b35ccdc2b3b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -622,7 +622,8 @@ lenout: * @prot: struct proto associated with this new sock instance * @zero_it: if we should zero the newly allocated sock */ -struct sock *sk_alloc(int family, int priority, struct proto *prot, int zero_it) +struct sock *sk_alloc(int family, unsigned int __nocast priority, + struct proto *prot, int zero_it) { struct sock *sk = NULL; kmem_cache_t *slab = prot->slab; @@ -750,7 +751,8 @@ unsigned long sock_i_ino(struct sock *sk) /* * Allocate a skb from the socket's send buffer. */ -struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority) +struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, + unsigned int __nocast priority) { if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { struct sk_buff * skb = alloc_skb(size, priority); @@ -765,7 +767,8 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int /* * Allocate a skb from the socket's receive buffer. */ -struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority) +struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, + unsigned int __nocast priority) { if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { struct sk_buff *skb = alloc_skb(size, priority); @@ -780,7 +783,7 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int /* * Allocate a memory block from the socket's option memory buffer. 
*/ -void *sock_kmalloc(struct sock *sk, int size, int priority) +void *sock_kmalloc(struct sock *sk, int size, unsigned int __nocast priority) { if ((unsigned)size <= sysctl_optmem_max && atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e041d057ec86..e3f8ea1bfa9c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1613,7 +1613,7 @@ void tcp_send_fin(struct sock *sk) * was unread data in the receive queue. This behavior is recommended * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM */ -void tcp_send_active_reset(struct sock *sk, int priority) +void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; -- cgit From 4c866aa798bc6de0a1d45495229e9f13c35b55c2 Mon Sep 17 00:00:00 2001 From: Alexey Kuznetsov Date: Fri, 8 Jul 2005 17:34:46 -0700 Subject: [IPV4]: Apply sysctl_icmp_echo_ignore_broadcasts to ICMP_TIMESTAMP as well. This was the full intention of the original code. Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index cb759484979d..279f57abfecb 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -970,7 +970,8 @@ int icmp_rcv(struct sk_buff *skb) * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently * discarded if to broadcast/multicast. */ - if (icmph->type == ICMP_ECHO && + if ((icmph->type == ICMP_ECHO || + icmph->type == ICMP_TIMESTAMP) && sysctl_icmp_echo_ignore_broadcasts) { goto error; } -- cgit From ca9b907d140a5f249250d19f956129dbbbf84f73 Mon Sep 17 00:00:00 2001 From: David L Stevens Date: Fri, 8 Jul 2005 17:38:07 -0700 Subject: [IPV4]: multicast API "join" issues This patch corrects a few problems with the IP_ADD_MEMBERSHIP socket option: 1) The existing code makes an attempt at reference counting joins when using the ip_mreqn/imr_ifindex interface. Joining the same group on the same socket is an error, whatever the API. This leads to unexpected results when mixing ip_mreqn by index with ip_mreqn by address, ip_mreq, or other API's. For example, ip_mreq followed by ip_mreqn of the same group will "work" while the same two reversed will not. Fixed to always return EADDRINUSE on a duplicate join and removed the (now unused) reference count in ip_mc_socklist. 2) The group-search list in ip_mc_join_group() is comparing a full ip_mreqn structure and all of it must match for it to find the group. This doesn't correctly match a group that was joined with ip_mreq or ip_mreqn with an address (with or without an index). It also doesn't match groups that are joined by different addresses on the same interface. All of these are the same multicast group, which is identified by group address and interface index. Fixed the check to correctly match groups so we don't get duplicate group entries on the ip_mc_socklist. 3) The old code allocates a multicast address before searching for duplicates requiring it to free in various error cases. This patch moves the allocate until after the search and igmp_max_memberships check, so never a need to allocate, then free an entry. Signed-off-by: David L Stevens Signed-off-by: David S. 
Miller --- net/ipv4/igmp.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 1f3183168a90..111eb678cbac 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1615,9 +1615,10 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) { int err; u32 addr = imr->imr_multiaddr.s_addr; - struct ip_mc_socklist *iml, *i; + struct ip_mc_socklist *iml=NULL, *i; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); + int ifindex; int count = 0; if (!MULTICAST(addr)) @@ -1633,37 +1634,30 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) goto done; } - iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); - err = -EADDRINUSE; + ifindex = imr->imr_ifindex; for (i = inet->mc_list; i; i = i->next) { - if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) { - /* New style additions are reference counted */ - if (imr->imr_address.s_addr == 0) { - i->count++; - err = 0; - } + if (i->multi.imr_multiaddr.s_addr == addr && + i->multi.imr_ifindex == ifindex) goto done; - } count++; } err = -ENOBUFS; - if (iml == NULL || count >= sysctl_igmp_max_memberships) + if (count >= sysctl_igmp_max_memberships) + goto done; + iml = (struct ip_mc_socklist *)sock_kmalloc(sk,sizeof(*iml),GFP_KERNEL); + if (iml == NULL) goto done; + memcpy(&iml->multi, imr, sizeof(*imr)); iml->next = inet->mc_list; - iml->count = 1; iml->sflist = NULL; iml->sfmode = MCAST_EXCLUDE; inet->mc_list = iml; ip_mc_inc_group(in_dev, addr); - iml = NULL; err = 0; - done: rtnl_shunlock(); - if (iml) - sock_kfree_s(sk, iml, sizeof(*iml)); return err; } @@ -1704,12 +1698,6 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) in_dev = inetdev_by_index(iml->multi.imr_ifindex); if (in_dev) (void) ip_mc_leave_src(sk, iml, in_dev); - if (--iml->count) { - rtnl_unlock(); - if (in_dev) - in_dev_put(in_dev); - return 0; - } *imlp = iml->next; @@ -1755,7 +1743,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct err = -EADDRNOTAVAIL; for (pmc=inet->mc_list; pmc; pmc=pmc->next) { - if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0) + if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr + && pmc->multi.imr_ifindex == imr.imr_ifindex) break; } if (!pmc) /* must have a prior join */ -- cgit From 8cdaaa15da58806ac3c75d96c40aef9e31445a25 Mon Sep 17 00:00:00 2001 From: David L Stevens Date: Fri, 8 Jul 2005 17:39:23 -0700 Subject: [IPV4]: multicast API "join" issues 1) Changes IP_ADD_SOURCE_MEMBERSHIP and MCAST_JOIN_SOURCE_GROUP to ignore EADDRINUSE errors on a "courtesy join" -- prior membership or not is ok for these. 2) Adds "leave group" equivalence of (INCLUDE, empty) filters in the delta-based API. Without this, mixing delta-based API calls that end in an (INCLUDE, empty) filter would not allow a subsequent regular IP_ADD_MEMBERSHIP. It also frees socket buffer memory that isn't needed for both the multicast group record and source filter. Signed-off-by: David L Stevens Signed-off-by: David S. 
Miller --- net/ipv4/igmp.c | 9 +++++++++ net/ipv4/ip_sockglue.c | 6 +++--- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 111eb678cbac..e21ebe759907 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1724,6 +1724,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct struct in_device *in_dev = NULL; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; + int leavegroup = 0; int i, j, rv; if (!MULTICAST(addr)) @@ -1775,6 +1776,12 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (rv) /* source not found */ goto done; + /* special case - (INCLUDE, empty) == LEAVE_GROUP */ + if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { + leavegroup = 1; + goto done; + } + /* update the interface filter */ ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); @@ -1831,6 +1838,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct &mreqs->imr_sourceaddr, 1); done: rtnl_shunlock(); + if (leavegroup) + return ip_mc_leave_group(sk, &imr); return err; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index f8b172f89811..fc7c481d0d79 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -677,11 +677,11 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, mreq.imr_address.s_addr = mreqs.imr_interface; mreq.imr_ifindex = 0; err = ip_mc_join_group(sk, &mreq); - if (err) + if (err && err != -EADDRINUSE) break; omode = MCAST_INCLUDE; add = 1; - } else /*IP_DROP_SOURCE_MEMBERSHIP */ { + } else /* IP_DROP_SOURCE_MEMBERSHIP */ { omode = MCAST_INCLUDE; add = 0; } @@ -754,7 +754,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, mreq.imr_address.s_addr = 0; mreq.imr_ifindex = greqs.gsr_interface; err = ip_mc_join_group(sk, &mreq); - if (err) + if (err && err != -EADDRINUSE) break; greqs.gsr_interface = mreq.imr_ifindex; omode = MCAST_INCLUDE; -- cgit From 917f2f105ea4bbba8604e3ed55233eebda7afe6a Mon Sep 17 00:00:00 2001 From: David L Stevens Date: Fri, 8 Jul 2005 17:45:16 -0700 Subject: [IPV4]: multicast API "join" issues 1) In the full-state API when imsf_numsrc == 0 errno should be "0", but returns EADDRNOTAVAIL 2) An illegal filter mode change errno should be EINVAL, but returns EADDRNOTAVAIL 3) Trying to do an any-source option without IP_ADD_MEMBERSHIP errno should be EINVAL, but returns EADDRNOTAVAIL 4) Adds comments for the less obvious error return values Signed-off-by: David L Stevens Signed-off-by: David S. 
Miller --- net/ipv4/igmp.c | 18 ++++++++++++------ net/ipv6/mcast.c | 18 ++++++++++++------ 2 files changed, 24 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index e21ebe759907..dbbfa09de4e8 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1748,12 +1748,16 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct && pmc->multi.imr_ifindex == imr.imr_ifindex) break; } - if (!pmc) /* must have a prior join */ + if (!pmc) { /* must have a prior join */ + err = -EINVAL; goto done; + } /* if a source filter was set, must be the same mode as before */ if (pmc->sflist) { - if (pmc->sfmode != omode) + if (pmc->sfmode != omode) { + err = -EINVAL; goto done; + } } else if (pmc->sfmode != omode) { /* allow mode switches for empty-set filters */ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0); @@ -1765,7 +1769,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct psl = pmc->sflist; if (!add) { if (!psl) - goto done; + goto done; /* err = -EADDRNOTAVAIL */ rv = !0; for (i=0; i<psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, break; } if (rv) /* source not found */ - goto done; + goto done; /* err = -EADDRNOTAVAIL */ /* special case - (INCLUDE, empty) == LEAVE_GROUP */ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { @@ -1870,15 +1874,16 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) err = -ENODEV; goto done; } - err = -EADDRNOTAVAIL; for (pmc=inet->mc_list; pmc; pmc=pmc->next) { if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && pmc->multi.imr_ifindex == imr.imr_ifindex) break; } - if (!pmc) /* must have a prior join */ + if (!pmc) { /* must have a prior join */ + err = -EINVAL; goto done; + } if (msf->imsf_numsrc) { newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk, IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL); @@ -1907,6 +1912,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) 0, NULL, 0); pmc->sflist = newpsl; pmc->sfmode = msf->imsf_fmode; + err = 0; done: rtnl_shunlock(); return err; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 562fcd14fdea..9db4581d0d79 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -386,12 +386,16 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (ipv6_addr_equal(&pmc->addr, group)) break; } - if (!pmc) /* must have a prior join */ + if (!pmc) { /* must have a prior join */ + err = -EINVAL; goto done; + } /* if a source filter was set, must be the same mode as before */ if (pmc->sflist) { - if (pmc->sfmode != omode) + if (pmc->sfmode != omode) { + err = -EINVAL; goto done; + } } else if (pmc->sfmode != omode) { /* allow mode switches for empty-set filters */ ip6_mc_add_src(idev, group, omode, 0, NULL, 0); @@ -402,7 +406,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, psl = pmc->sflist; if (!add) { if (!psl) - goto done; + goto done; /* err = -EADDRNOTAVAIL */ rv = !0; for (i=0; i<psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], source, break; } if (rv) /* source not found */ - goto done; + goto done; /* err = -EADDRNOTAVAIL */ /* special case - (INCLUDE, empty) == LEAVE_GROUP */ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { @@ -503,7 +507,6 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) if (!idev) return -ENODEV; dev = idev->dev; - err = -EADDRNOTAVAIL; for (pmc=inet6->ipv6_mc_list;
pmc; pmc=pmc->next) { if (pmc->ifindex != gsf->gf_interface) @@ -511,8 +514,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) if (ipv6_addr_equal(&pmc->addr, group)) break; } - if (!pmc) /* must have a prior join */ + if (!pmc) { /* must have a prior join */ + err = -EINVAL; goto done; + } if (gsf->gf_numsrc) { newpsl = (struct ip6_sf_socklist *)sock_kmalloc(sk, IP6_SFLSIZE(gsf->gf_numsrc), GFP_ATOMIC); @@ -544,6 +549,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); pmc->sflist = newpsl; pmc->sfmode = gsf->gf_fmode; + err = 0; done: read_unlock_bh(&idev->lock); in6_dev_put(idev); -- cgit From 9951f036fe8a4e6b21962559c64ff13b290ff01a Mon Sep 17 00:00:00 2001 From: David L Stevens Date: Fri, 8 Jul 2005 17:47:28 -0700 Subject: [IPV4]: (INCLUDE,empty)/leave-group equivalence for full-state MSF APIs & errno fix 1) Adds (INCLUDE, empty)/leave-group equivalence to the full-state multicast source filter APIs (IPv4 and IPv6) 2) Fixes an incorrect errno in the IPv6 leave-group (ENOENT should be EADDRNOTAVAIL) Signed-off-by: David L Stevens Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 11 ++++++++++- net/ipv6/mcast.c | 10 +++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index dbbfa09de4e8..7af3146939dc 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1849,13 +1849,14 @@ done: int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) { - int err; + int err = 0; struct ip_mreqn imr; u32 addr = msf->imsf_multiaddr; struct ip_mc_socklist *pmc; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *newpsl, *psl; + int leavegroup = 0; if (!MULTICAST(addr)) return -EINVAL; @@ -1875,6 +1876,12 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) goto done; } + /* special case - (INCLUDE, empty) == LEAVE_GROUP */ + if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) { + leavegroup = 1; + goto done; + } + for (pmc=inet->mc_list; pmc; pmc=pmc->next) { if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && pmc->multi.imr_ifindex == imr.imr_ifindex) @@ -1915,6 +1922,8 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) err = 0; done: rtnl_shunlock(); + if (leavegroup) + err = ip_mc_leave_group(sk, &imr); return err; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 9db4581d0d79..398c982625f8 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -281,7 +281,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr) } write_unlock_bh(&ipv6_sk_mc_lock); - return -ENOENT; + return -EADDRNOTAVAIL; } static struct inet6_dev *ip6_mc_find_dev(struct in6_addr *group, int ifindex) @@ -492,6 +492,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) struct inet6_dev *idev; struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *newpsl, *psl; + int leavegroup = 0; int i, err; group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; @@ -508,6 +509,11 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) return -ENODEV; dev = idev->dev; + if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) { + leavegroup = 1; + goto done; + } + for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { if (pmc->ifindex != gsf->gf_interface) continue; @@ -554,6 +560,8 @@ done: read_unlock_bh(&idev->lock); in6_dev_put(idev); dev_put(dev); + if (leavegroup) + err = 
ipv6_sock_mc_drop(sk, gsf->gf_interface, group); return err; } -- cgit From 84b42baef775b0e3415ccece17cf694f50326d01 Mon Sep 17 00:00:00 2001 From: David L Stevens Date: Fri, 8 Jul 2005 17:48:38 -0700 Subject: [IPV4]: fix IPv4 leave-group group matching This patch fixes the multicast group matching for IP_DROP_MEMBERSHIP, similar to the IP_ADD_MEMBERSHIP fix in a prior patch. Groups are identified by group address and interface index, and including the interface address in the match will fail if a leave-group is done by address when the join was done by index, or if different addresses on the same interface are used in the join and leave. Signed-off-by: David L Stevens Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 7af3146939dc..5088f90835ae 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1687,24 +1687,25 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *iml, **imlp; + struct in_device *in_dev; + u32 group = imr->imr_multiaddr.s_addr; + u32 ifindex; rtnl_lock(); + in_dev = ip_mc_find_dev(imr); + if (!in_dev) { + rtnl_unlock(); + return -ENODEV; + } + ifindex = imr->imr_ifindex; for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) { - if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr && - iml->multi.imr_address.s_addr==imr->imr_address.s_addr && - (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) { - struct in_device *in_dev; - - in_dev = inetdev_by_index(iml->multi.imr_ifindex); - if (in_dev) - (void) ip_mc_leave_src(sk, iml, in_dev); + if (iml->multi.imr_multiaddr.s_addr == group && + iml->multi.imr_ifindex == ifindex) { + (void) ip_mc_leave_src(sk, iml, in_dev); *imlp = iml->next; - if (in_dev) { - ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr); - in_dev_put(in_dev); - } + ip_mc_dec_group(in_dev, group); rtnl_unlock(); sock_kfree_s(sk, iml, sizeof(*iml)); return 0; -- cgit From 9c05989bb2264f0fa4fc95f81d2c4e6aa2eaa24d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 8 Jul 2005 21:44:39 -0700 Subject: [IPV6]: Fix warning in ip6_mc_msfilter. Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 398c982625f8..29fed6e58d0a 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -509,6 +509,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) return -ENODEV; dev = idev->dev; + err = 0; if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) { leavegroup = 1; goto done; -- cgit From 79af02c2538d54ff0dcd3f43646f506207f2ee62 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 8 Jul 2005 21:47:49 -0700 Subject: [SCTP]: Use struct list_head for chunk lists, not sk_buff_head. Signed-off-by: David S.
Miller --- net/sctp/associola.c | 2 +- net/sctp/input.c | 26 +++++++++++++++---------- net/sctp/inqueue.c | 18 +++++++++++------ net/sctp/output.c | 22 ++++++++++++--------- net/sctp/outqueue.c | 50 +++++++++++++++++++++++++++--------------------- net/sctp/sm_make_chunk.c | 12 ++++++++---- net/sctp/socket.c | 2 +- 7 files changed, 79 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 7ae6aa772dab..4b47dd6f2485 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -203,7 +203,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a */ asoc->addip_serial = asoc->c.initial_tsn; - skb_queue_head_init(&asoc->addip_chunks); + INIT_LIST_HEAD(&asoc->addip_chunk_list); /* Make an empty list of remote transport addresses. */ INIT_LIST_HEAD(&asoc->peer.transport_addr_list); diff --git a/net/sctp/input.c b/net/sctp/input.c index 339f7acfdb64..5e085e041a6e 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -115,6 +115,17 @@ static void sctp_rcv_set_owner_r(struct sk_buff *skb, struct sock *sk) atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc); } +struct sctp_input_cb { + union { + struct inet_skb_parm h4; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + struct inet6_skb_parm h6; +#endif + } header; + struct sctp_chunk *chunk; +}; +#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) + /* * This is the routine which IP calls when receiving an SCTP packet. */ @@ -243,6 +254,7 @@ int sctp_rcv(struct sk_buff *skb) ret = -ENOMEM; goto discard_release; } + SCTP_INPUT_CB(skb)->chunk = chunk; sctp_rcv_set_owner_r(skb,sk); @@ -265,9 +277,9 @@ int sctp_rcv(struct sk_buff *skb) sctp_bh_lock_sock(sk); if (sock_owned_by_user(sk)) - sk_add_backlog(sk, (struct sk_buff *) chunk); + sk_add_backlog(sk, skb); else - sctp_backlog_rcv(sk, (struct sk_buff *) chunk); + sctp_backlog_rcv(sk, skb); /* Release the sock and any reference counts we took in the * lookup calls. @@ -302,14 +314,8 @@ discard_release: */ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) { - struct sctp_chunk *chunk; - struct sctp_inq *inqueue; - - /* One day chunk will live inside the skb, but for - * now this works. - */ - chunk = (struct sctp_chunk *) skb; - inqueue = &chunk->rcvr->inqueue; + struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; + struct sctp_inq *inqueue = &chunk->rcvr->inqueue; sctp_inq_push(inqueue, chunk); return 0; diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index cedf4351556c..2d33922c044b 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -50,7 +50,7 @@ /* Initialize an SCTP inqueue. */ void sctp_inq_init(struct sctp_inq *queue) { - skb_queue_head_init(&queue->in); + INIT_LIST_HEAD(&queue->in_chunk_list); queue->in_progress = NULL; /* Create a task for delivering data. */ @@ -62,11 +62,13 @@ void sctp_inq_init(struct sctp_inq *queue) /* Release the memory associated with an SCTP inqueue. */ void sctp_inq_free(struct sctp_inq *queue) { - struct sctp_chunk *chunk; + struct sctp_chunk *chunk, *tmp; /* Empty the queue. */ - while ((chunk = (struct sctp_chunk *) skb_dequeue(&queue->in)) != NULL) + list_for_each_entry_safe(chunk, tmp, &queue->in_chunk_list, list) { + list_del_init(&chunk->list); sctp_chunk_free(chunk); + } /* If there is a packet which is currently being worked on, * free it as well. 
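The conversion idiom is the same in every file this patch touches: the old code cast between struct sk_buff and struct sctp_chunk so it could reuse the sk_buff_head primitives, while the new code embeds a struct list_head in the chunk and walks it with the ordinary list helpers. A minimal self-contained sketch of the pattern (hypothetical names, not the tree's own):

	#include <linux/list.h>
	#include <linux/slab.h>

	struct my_chunk {
		struct list_head list;	/* links the chunk into a queue */
		int payload;
	};

	static void my_queue_free(struct list_head *head)
	{
		struct my_chunk *ch, *tmp;

		/* the _safe variant allows freeing entries mid-walk */
		list_for_each_entry_safe(ch, tmp, head, list) {
			list_del_init(&ch->list);
			kfree(ch);
		}
	}

Queues are set up with INIT_LIST_HEAD() and chunks are appended with list_add_tail(), exactly as sctp_inq_init() above and sctp_inq_push() just below do.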
@@ -92,7 +94,7 @@ void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *packet) * Eventually, we should clean up inqueue to not rely * on the BH related data structures. */ - skb_queue_tail(&(q->in), (struct sk_buff *) packet); + list_add_tail(&packet->list, &q->in_chunk_list); q->immediate.func(q->immediate.data); } @@ -131,12 +133,16 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) /* Do we need to take the next packet out of the queue to process? */ if (!chunk) { + struct list_head *entry; + /* Is the queue empty? */ - if (skb_queue_empty(&queue->in)) + if (list_empty(&queue->in_chunk_list)) return NULL; + entry = queue->in_chunk_list.next; chunk = queue->in_progress = - (struct sctp_chunk *) skb_dequeue(&queue->in); + list_entry(entry, struct sctp_chunk, list); + list_del_init(entry); /* This is the first chunk in the packet. */ chunk->singleton = 1; diff --git a/net/sctp/output.c b/net/sctp/output.c index 84b5b370b09d..931371633464 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -108,7 +108,7 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet, packet->transport = transport; packet->source_port = sport; packet->destination_port = dport; - skb_queue_head_init(&packet->chunks); + INIT_LIST_HEAD(&packet->chunk_list); if (asoc) { struct sctp_sock *sp = sctp_sk(asoc->base.sk); overhead = sp->pf->af->net_header_len; @@ -129,12 +129,14 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet, /* Free a packet. */ void sctp_packet_free(struct sctp_packet *packet) { - struct sctp_chunk *chunk; + struct sctp_chunk *chunk, *tmp; SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); - while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) + list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { + list_del_init(&chunk->list); sctp_chunk_free(chunk); + } if (packet->malloced) kfree(packet); @@ -276,7 +278,7 @@ append: packet->has_sack = 1; /* It is OK to send this chunk. */ - __skb_queue_tail(&packet->chunks, (struct sk_buff *)chunk); + list_add_tail(&chunk->list, &packet->chunk_list); packet->size += chunk_len; chunk->transport = packet->transport; finish: @@ -295,7 +297,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) struct sctphdr *sh; __u32 crc32; struct sk_buff *nskb; - struct sctp_chunk *chunk; + struct sctp_chunk *chunk, *tmp; struct sock *sk; int err = 0; int padding; /* How much padding do we need? */ @@ -305,11 +307,11 @@ int sctp_packet_transmit(struct sctp_packet *packet) SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); /* Do NOT generate a chunkless packet. */ - chunk = (struct sctp_chunk *)skb_peek(&packet->chunks); - if (unlikely(!chunk)) + if (list_empty(&packet->chunk_list)) return err; /* Set up convenience variables... */ + chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); sk = chunk->skb->sk; /* Allocate the new skb. */ @@ -370,7 +372,8 @@ int sctp_packet_transmit(struct sctp_packet *packet) * [This whole comment explains WORD_ROUND() below.] */ SCTP_DEBUG_PRINTK("***sctp_transmit_packet***\n"); - while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) { + list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { + list_del_init(&chunk->list); if (sctp_chunk_is_data(chunk)) { if (!chunk->has_tsn) { @@ -511,7 +514,8 @@ err: * will get resent or dropped later. 
*/ - while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) { + list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { + list_del_init(&chunk->list); if (!sctp_chunk_is_data(chunk)) sctp_chunk_free(chunk); } diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 4eb81a1407b7..efb72faba20c 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -75,7 +75,7 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn); static inline void sctp_outq_head_data(struct sctp_outq *q, struct sctp_chunk *ch) { - __skb_queue_head(&q->out, (struct sk_buff *)ch); + list_add(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; return; } @@ -83,17 +83,22 @@ static inline void sctp_outq_head_data(struct sctp_outq *q, /* Take data from the front of the queue. */ static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q) { - struct sctp_chunk *ch; - ch = (struct sctp_chunk *)__skb_dequeue(&q->out); - if (ch) + struct sctp_chunk *ch = NULL; + + if (!list_empty(&q->out_chunk_list)) { + struct list_head *entry = q->out_chunk_list.next; + + ch = list_entry(entry, struct sctp_chunk, list); + list_del_init(entry); q->out_qlen -= ch->skb->len; + } return ch; } /* Add data chunk to the end of the queue. */ static inline void sctp_outq_tail_data(struct sctp_outq *q, struct sctp_chunk *ch) { - __skb_queue_tail(&q->out, (struct sk_buff *)ch); + list_add_tail(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; return; } @@ -197,8 +202,8 @@ static inline int sctp_cacc_skip(struct sctp_transport *primary, void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) { q->asoc = asoc; - skb_queue_head_init(&q->out); - skb_queue_head_init(&q->control); + INIT_LIST_HEAD(&q->out_chunk_list); + INIT_LIST_HEAD(&q->control_chunk_list); INIT_LIST_HEAD(&q->retransmit); INIT_LIST_HEAD(&q->sacked); INIT_LIST_HEAD(&q->abandoned); @@ -217,7 +222,7 @@ void sctp_outq_teardown(struct sctp_outq *q) { struct sctp_transport *transport; struct list_head *lchunk, *pos, *temp; - struct sctp_chunk *chunk; + struct sctp_chunk *chunk, *tmp; /* Throw away unacknowledged chunks. */ list_for_each(pos, &q->asoc->peer.transport_addr_list) { @@ -269,8 +274,10 @@ void sctp_outq_teardown(struct sctp_outq *q) q->error = 0; /* Throw away any leftover control chunks. */ - while ((chunk = (struct sctp_chunk *) skb_dequeue(&q->control)) != NULL) + list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) { + list_del_init(&chunk->list); sctp_chunk_free(chunk); + } } /* Free the outqueue structure and any related pending chunks. */ @@ -333,7 +340,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk) break; }; } else { - __skb_queue_tail(&q->control, (struct sk_buff *) chunk); + list_add_tail(&chunk->list, &q->control_chunk_list); SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); } @@ -650,10 +657,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) __u16 sport = asoc->base.bind_addr.port; __u16 dport = asoc->peer.port; __u32 vtag = asoc->peer.i.init_tag; - struct sk_buff_head *queue; struct sctp_transport *transport = NULL; struct sctp_transport *new_transport; - struct sctp_chunk *chunk; + struct sctp_chunk *chunk, *tmp; sctp_xmit_t status; int error = 0; int start_timer = 0; @@ -675,8 +681,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) * ... 
*/ - queue = &q->control; - while ((chunk = (struct sctp_chunk *)skb_dequeue(queue)) != NULL) { + list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) { + list_del_init(&chunk->list); + /* Pick the right transport to use. */ new_transport = chunk->transport; @@ -814,8 +821,6 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) /* Finally, transmit new packets. */ start_timer = 0; - queue = &q->out; - while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { /* RFC 2960 6.5 Every DATA chunk MUST carry a valid * stream identifier. @@ -1149,8 +1154,9 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack) /* See if all chunks are acked. * Make sure the empty queue handler will get run later. */ - q->empty = skb_queue_empty(&q->out) && skb_queue_empty(&q->control) && - list_empty(&q->retransmit); + q->empty = (list_empty(&q->out_chunk_list) && + list_empty(&q->control_chunk_list) && + list_empty(&q->retransmit)); if (!q->empty) goto finish; @@ -1679,9 +1685,9 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) if (TSN_lte(tsn, ctsn)) { list_del_init(lchunk); if (!chunk->tsn_gap_acked) { - chunk->transport->flight_size -= - sctp_data_size(chunk); - q->outstanding_bytes -= sctp_data_size(chunk); + chunk->transport->flight_size -= + sctp_data_size(chunk); + q->outstanding_bytes -= sctp_data_size(chunk); } sctp_chunk_free(chunk); } else { @@ -1729,7 +1735,7 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) nskips, &ftsn_skip_arr[0]); if (ftsn_chunk) { - __skb_queue_tail(&q->control, (struct sk_buff *)ftsn_chunk); + list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); } } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 5baed9bb7de5..773cd93fa3d0 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1003,6 +1003,7 @@ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb, SCTP_DEBUG_PRINTK("chunkifying skb %p w/o an sk\n", skb); } + INIT_LIST_HEAD(&retval->list); retval->skb = skb; retval->asoc = (struct sctp_association *)asoc; retval->resent = 0; @@ -1116,8 +1117,7 @@ static void sctp_chunk_destroy(struct sctp_chunk *chunk) /* Possibly, free the chunk. */ void sctp_chunk_free(struct sctp_chunk *chunk) { - /* Make sure that we are not on any list. */ - skb_unlink((struct sk_buff *) chunk); + BUG_ON(!list_empty(&chunk->list)); list_del_init(&chunk->transmitted_list); /* Release our reference on the message tracker. */ @@ -2739,8 +2739,12 @@ int sctp_process_asconf_ack(struct sctp_association *asoc, asoc->addip_last_asconf = NULL; /* Send the next asconf chunk from the addip chunk queue. */ - asconf = (struct sctp_chunk *)__skb_dequeue(&asoc->addip_chunks); - if (asconf) { + if (!list_empty(&asoc->addip_chunk_list)) { + struct list_head *entry = asoc->addip_chunk_list.next; + asconf = list_entry(entry, struct sctp_chunk, list); + + list_del_init(entry); + /* Hold the chunk until an ASCONF_ACK is received. */ sctp_chunk_hold(asconf); if (sctp_primitive_ASCONF(asoc, asconf)) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index aad55dc3792b..091a66f06a35 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -406,7 +406,7 @@ static int sctp_send_asconf(struct sctp_association *asoc, * transmission. 
*/ if (asoc->addip_last_asconf) { - __skb_queue_tail(&asoc->addip_chunks, (struct sk_buff *)chunk); + list_add_tail(&chunk->list, &asoc->addip_chunk_list); goto out; } -- cgit From 3182cd84f0e132558bbe106c070405ae49f1f0e3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 11 Jul 2005 20:57:47 -0700 Subject: [SCTP]: __nocast annotations Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/sctp/associola.c | 13 ++++++++----- net/sctp/bind_addr.c | 16 ++++++++++------ net/sctp/chunk.c | 2 +- net/sctp/endpointola.c | 6 ++++-- net/sctp/protocol.c | 2 +- net/sctp/sm_make_chunk.c | 15 ++++++++------- net/sctp/sm_sideeffect.c | 13 +++++++------ net/sctp/ssnmap.c | 3 ++- net/sctp/transport.c | 5 +++-- net/sctp/ulpevent.c | 19 ++++++++++--------- net/sctp/ulpqueue.c | 9 +++++---- 11 files changed, 59 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 4b47dd6f2485..5b24ae0650d3 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -71,7 +71,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a const struct sctp_endpoint *ep, const struct sock *sk, sctp_scope_t scope, - int gfp) + unsigned int __nocast gfp) { struct sctp_sock *sp; int i; @@ -272,7 +272,8 @@ fail_init: /* Allocate and initialize a new association */ struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep, const struct sock *sk, - sctp_scope_t scope, int gfp) + sctp_scope_t scope, + unsigned int __nocast gfp) { struct sctp_association *asoc; @@ -478,7 +479,7 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc, /* Add a transport address to an association. */ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, const union sctp_addr *addr, - const int gfp, + const unsigned int __nocast gfp, const int peer_state) { struct sctp_transport *peer; @@ -1229,7 +1230,8 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned len) /* Build the bind address list for the association based on info from the * local endpoint and the remote peer. */ -int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, int gfp) +int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, + unsigned int __nocast gfp) { sctp_scope_t scope; int flags; @@ -1251,7 +1253,8 @@ int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, int gfp) /* Build the association's bind address list from the cookie. */ int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc, - struct sctp_cookie *cookie, int gfp) + struct sctp_cookie *cookie, + unsigned int __nocast gfp) { int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length); int var_size3 = cookie->raw_addr_list_len; diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index f90eadfb60a2..f71549710f2e 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -53,7 +53,8 @@ /* Forward declarations for internal helpers. */ static int sctp_copy_one_addr(struct sctp_bind_addr *, union sctp_addr *, - sctp_scope_t scope, int gfp, int flags); + sctp_scope_t scope, unsigned int __nocast gfp, + int flags); static void sctp_bind_addr_clean(struct sctp_bind_addr *); /* First Level Abstractions. 
*/ @@ -63,7 +64,8 @@ static void sctp_bind_addr_clean(struct sctp_bind_addr *); */ int sctp_bind_addr_copy(struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, - sctp_scope_t scope, int gfp, int flags) + sctp_scope_t scope, unsigned int __nocast gfp, + int flags) { struct sctp_sockaddr_entry *addr; struct list_head *pos; @@ -144,7 +146,7 @@ void sctp_bind_addr_free(struct sctp_bind_addr *bp) /* Add an address to the bind address list in the SCTP_bind_addr structure. */ int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, - int gfp) + unsigned int __nocast gfp) { struct sctp_sockaddr_entry *addr; @@ -197,7 +199,8 @@ int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr) * The second argument is the return value for the length. */ union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp, - int *addrs_len, int gfp) + int *addrs_len, + unsigned int __nocast gfp) { union sctp_params addrparms; union sctp_params retval; @@ -249,7 +252,7 @@ end_raw: * address parameters). */ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, - int addrs_len, __u16 port, int gfp) + int addrs_len, __u16 port, unsigned int __nocast gfp) { union sctp_addr_param *rawaddr; struct sctp_paramhdr *param; @@ -347,7 +350,8 @@ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp, /* Copy out addresses from the global local address list. */ static int sctp_copy_one_addr(struct sctp_bind_addr *dest, union sctp_addr *addr, - sctp_scope_t scope, int gfp, int flags) + sctp_scope_t scope, unsigned int __nocast gfp, + int flags) { int error = 0; diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 0c2ab7885058..61da2937e641 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -62,7 +62,7 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg) } /* Allocate and initialize datamsg. */ -SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(int gfp) +SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(unsigned int __nocast gfp) { struct sctp_datamsg *msg; msg = kmalloc(sizeof(struct sctp_datamsg), gfp); diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index c44bf4165c6e..e47ac0d1a6d6 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -67,7 +67,8 @@ static void sctp_endpoint_bh_rcv(struct sctp_endpoint *ep); * Initialize the base fields of the endpoint structure. */ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, - struct sock *sk, int gfp) + struct sock *sk, + unsigned int __nocast gfp) { struct sctp_sock *sp = sctp_sk(sk); memset(ep, 0, sizeof(struct sctp_endpoint)); @@ -137,7 +138,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, /* Create a sctp_endpoint with all that boring stuff initialized. * Returns NULL if there isn't enough memory. */ -struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, int gfp) +struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, + unsigned int __nocast gfp) { struct sctp_endpoint *ep; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index e7f37faba7c0..ce9245e71fca 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -219,7 +219,7 @@ static void sctp_free_local_addr_list(void) /* Copy the local addresses which are valid for 'scope' into 'bp'. 
*/ int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope, - int gfp, int copy_flags) + unsigned int __nocast gfp, int copy_flags) { struct sctp_sockaddr_entry *addr; int error = 0; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 773cd93fa3d0..00d32b7c8266 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -78,7 +78,7 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, static int sctp_process_param(struct sctp_association *asoc, union sctp_params param, const union sctp_addr *peer_addr, - int gfp); + unsigned int __nocast gfp); /* What was the inbound interface for this chunk? */ int sctp_chunk_iif(const struct sctp_chunk *chunk) @@ -174,7 +174,7 @@ void sctp_init_cause(struct sctp_chunk *chunk, __u16 cause_code, */ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, const struct sctp_bind_addr *bp, - int gfp, int vparam_len) + unsigned int __nocast gfp, int vparam_len) { sctp_inithdr_t init; union sctp_params addrs; @@ -261,7 +261,7 @@ nodata: struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, - int gfp, int unkparam_len) + unsigned int __nocast gfp, int unkparam_len) { sctp_inithdr_t initack; struct sctp_chunk *retval; @@ -1233,7 +1233,8 @@ void sctp_chunk_assign_tsn(struct sctp_chunk *chunk) /* Create a CLOSED association to use with an incoming packet. */ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, - struct sctp_chunk *chunk, int gfp) + struct sctp_chunk *chunk, + unsigned int __nocast gfp) { struct sctp_association *asoc; struct sk_buff *skb; @@ -1348,7 +1349,7 @@ nodata: struct sctp_association *sctp_unpack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, - struct sctp_chunk *chunk, int gfp, + struct sctp_chunk *chunk, unsigned int __nocast gfp, int *error, struct sctp_chunk **errp) { struct sctp_association *retval = NULL; @@ -1812,7 +1813,7 @@ int sctp_verify_init(const struct sctp_association *asoc, */ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid, const union sctp_addr *peer_addr, - sctp_init_chunk_t *peer_init, int gfp) + sctp_init_chunk_t *peer_init, unsigned int __nocast gfp) { union sctp_params param; struct sctp_transport *transport; @@ -1983,7 +1984,7 @@ nomem: static int sctp_process_param(struct sctp_association *asoc, union sctp_params param, const union sctp_addr *peer_addr, - int gfp) + unsigned int __nocast gfp) { union sctp_addr addr; int i; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 778639db125a..39c970b5b198 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -63,7 +63,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, void *event_arg, sctp_disposition_t status, sctp_cmd_seq_t *commands, - int gfp); + unsigned int __nocast gfp); static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, sctp_state_t state, struct sctp_endpoint *ep, @@ -71,7 +71,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, void *event_arg, sctp_disposition_t status, sctp_cmd_seq_t *commands, - int gfp); + unsigned int __nocast gfp); /******************************************************************** * Helper functions @@ -497,7 +497,8 @@ static void sctp_cmd_assoc_failed(sctp_cmd_seq_t *commands, static int sctp_cmd_process_init(sctp_cmd_seq_t *commands, struct sctp_association *asoc, struct sctp_chunk *chunk, - sctp_init_chunk_t 
*peer_init, int gfp) + sctp_init_chunk_t *peer_init, + unsigned int __nocast gfp) { int error; @@ -852,7 +853,7 @@ int sctp_do_sm(sctp_event_t event_type, sctp_subtype_t subtype, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, - int gfp) + unsigned int __nocast gfp) { sctp_cmd_seq_t commands; const sctp_sm_table_entry_t *state_fn; @@ -897,7 +898,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, void *event_arg, sctp_disposition_t status, sctp_cmd_seq_t *commands, - int gfp) + unsigned int __nocast gfp) { int error; @@ -985,7 +986,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, void *event_arg, sctp_disposition_t status, sctp_cmd_seq_t *commands, - int gfp) + unsigned int __nocast gfp) { int error = 0; int force; diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c index e627d2b451b6..25037daf3fa0 100644 --- a/net/sctp/ssnmap.c +++ b/net/sctp/ssnmap.c @@ -57,7 +57,8 @@ static inline size_t sctp_ssnmap_size(__u16 in, __u16 out) /* Create a new sctp_ssnmap. * Allocate room to store at least 'len' contiguous TSNs. */ -struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out, int gfp) +struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out, + unsigned int __nocast gfp) { struct sctp_ssnmap *retval; int size; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index a63b69179607..d2f04ebe5081 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -57,7 +57,7 @@ /* Initialize a new transport from provided memory. */ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, const union sctp_addr *addr, - int gfp) + unsigned int __nocast gfp) { /* Copy in the address. */ peer->ipaddr = *addr; @@ -121,7 +121,8 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, } /* Allocate and initialize a new transport. */ -struct sctp_transport *sctp_transport_new(const union sctp_addr *addr, int gfp) +struct sctp_transport *sctp_transport_new(const union sctp_addr *addr, + unsigned int __nocast gfp) { struct sctp_transport *transport; diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 17d0ff534735..0abd5101107c 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -74,7 +74,7 @@ SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags) /* Create a new sctp_ulpevent. 
*/ SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags, - int gfp) + unsigned int __nocast gfp) { struct sctp_ulpevent *event; struct sk_buff *skb; @@ -136,7 +136,7 @@ static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event) struct sctp_ulpevent *sctp_ulpevent_make_assoc_change( const struct sctp_association *asoc, __u16 flags, __u16 state, __u16 error, __u16 outbound, - __u16 inbound, int gfp) + __u16 inbound, unsigned int __nocast gfp) { struct sctp_ulpevent *event; struct sctp_assoc_change *sac; @@ -237,7 +237,7 @@ fail: struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( const struct sctp_association *asoc, const struct sockaddr_storage *aaddr, - int flags, int state, int error, int gfp) + int flags, int state, int error, unsigned int __nocast gfp) { struct sctp_ulpevent *event; struct sctp_paddr_change *spc; @@ -350,7 +350,7 @@ fail: */ struct sctp_ulpevent *sctp_ulpevent_make_remote_error( const struct sctp_association *asoc, struct sctp_chunk *chunk, - __u16 flags, int gfp) + __u16 flags, unsigned int __nocast gfp) { struct sctp_ulpevent *event; struct sctp_remote_error *sre; @@ -448,7 +448,7 @@ fail: */ struct sctp_ulpevent *sctp_ulpevent_make_send_failed( const struct sctp_association *asoc, struct sctp_chunk *chunk, - __u16 flags, __u32 error, int gfp) + __u16 flags, __u32 error, unsigned int __nocast gfp) { struct sctp_ulpevent *event; struct sctp_send_failed *ssf; @@ -557,7 +557,7 @@ fail: */ struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event( const struct sctp_association *asoc, - __u16 flags, int gfp) + __u16 flags, unsigned int __nocast gfp) { struct sctp_ulpevent *event; struct sctp_shutdown_event *sse; @@ -620,7 +620,7 @@ fail: * 5.3.1.6 SCTP_ADAPTION_INDICATION */ struct sctp_ulpevent *sctp_ulpevent_make_adaption_indication( - const struct sctp_association *asoc, int gfp) + const struct sctp_association *asoc, unsigned int __nocast gfp) { struct sctp_ulpevent *event; struct sctp_adaption_event *sai; @@ -657,7 +657,7 @@ fail: */ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, struct sctp_chunk *chunk, - int gfp) + unsigned int __nocast gfp) { struct sctp_ulpevent *event = NULL; struct sk_buff *skb; @@ -718,7 +718,8 @@ fail: * various events. */ struct sctp_ulpevent *sctp_ulpevent_make_pdapi( - const struct sctp_association *asoc, __u32 indication, int gfp) + const struct sctp_association *asoc, __u32 indication, + unsigned int __nocast gfp) { struct sctp_ulpevent *event; struct sctp_pdapi_event *pd; diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index d5dd2cf7ac4a..8bbc279d6c99 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -100,7 +100,7 @@ void sctp_ulpq_free(struct sctp_ulpq *ulpq) /* Process an incoming DATA chunk. */ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, - int gfp) + unsigned int __nocast gfp) { struct sk_buff_head temp; sctp_data_chunk_t *hdr; @@ -778,7 +778,8 @@ static __u16 sctp_ulpq_renege_frags(struct sctp_ulpq *ulpq, __u16 needed) /* Partial deliver the first message as there is pressure on rwnd. */ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, - struct sctp_chunk *chunk, int gfp) + struct sctp_chunk *chunk, + unsigned int __nocast gfp) { struct sctp_ulpevent *event; struct sctp_association *asoc; @@ -802,7 +803,7 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, /* Renege some packets to make room for an incoming chunk. 
*/ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, - int gfp) + unsigned int __nocast gfp) { struct sctp_association *asoc; __u16 needed, freed; @@ -841,7 +842,7 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, /* Notify the application if an association is aborted and in * partial delivery mode. Send up any pending received messages. */ -void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, int gfp) +void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, unsigned int __nocast gfp) { struct sctp_ulpevent *ev = NULL; struct sock *sk; -- cgit From f5b8adb4f5767415b7b00e32e4766a052e2ed4cc Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 11 Jul 2005 20:59:03 -0700 Subject: [NET]: Trivial spelling fix patch for net/Kconfig Signed-off-by: Jesper Juhl Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/Kconfig b/net/Kconfig index 9251b28e8d5d..b9ca0a581bde 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -272,7 +272,7 @@ config ATM_BR2684 tristate "RFC1483/2684 Bridged protocols" depends on ATM && INET help - ATM PVCs can carry ethernet PDUs according to rfc2684 (formerly 1483) + ATM PVCs can carry ethernet PDUs according to RFC2684 (formerly 1483) This device will act like an ethernet from the kernels point of view, with the traffic being carried by ATM PVCs (currently 1 PVC/device). This is sometimes used over DSL lines. If in doubt, say N. @@ -281,7 +281,7 @@ config ATM_BR2684_IPFILTER bool "Per-VC IP filter kludge" depends on ATM_BR2684 help - This is an experimental mechanism for users who need to terminating a + This is an experimental mechanism for users who need to terminate a large number of IP-only vcc's. Do not enable this unless you are sure you know what you are doing. -- cgit From af9debd461d10fe582c9c0e80eafa69f698331ed Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 11 Jul 2005 20:59:57 -0700 Subject: [IPVS]: Add and reorder bh locks after moving to keventd. An addition to the last ipvs changes that move update_defense_level/si_meminfo to keventd: - ip_vs_random_dropentry now runs in process context and should use _bh locks to protect from softirqs - update_defense_level still needs _bh locks after si_meminfo is called, for the same purpose Signed-off-by: Julian Anastasov Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/ipv4/ipvs/ip_vs_conn.c | 6 +++--- net/ipv4/ipvs/ip_vs_ctl.c | 9 ++++++--- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 9f16ab309106..d0145a8b1551 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -758,7 +758,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp) return 1; } - +/* Called from keventd and must protect itself from softirqs */ void ip_vs_random_dropentry(void) { int idx; @@ -773,7 +773,7 @@ void ip_vs_random_dropentry(void) /* * Lock is actually needed in this loop. 
*/ - ct_write_lock(hash); + ct_write_lock_bh(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT)) @@ -806,7 +806,7 @@ void ip_vs_random_dropentry(void) ip_vs_conn_expire_now(cp->control); } } - ct_write_unlock(hash); + ct_write_unlock_bh(hash); } } diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 12a82e91d22a..7d99ede2ef79 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -90,7 +90,8 @@ int ip_vs_get_debug_level(void) #endif /* - * update_defense_level is called from keventd and from sysctl. + * update_defense_level is called from keventd and from sysctl, + * so it needs to protect itself from softirqs */ static void update_defense_level(void) { @@ -110,6 +111,8 @@ static void update_defense_level(void) nomem = (availmem < sysctl_ip_vs_amemthresh); + local_bh_disable(); + /* drop_entry */ spin_lock(&__ip_vs_dropentry_lock); switch (sysctl_ip_vs_drop_entry) { @@ -206,6 +209,8 @@ static void update_defense_level(void) if (to_change >= 0) ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); write_unlock(&__ip_vs_securetcp_lock); + + local_bh_enable(); } @@ -1360,9 +1365,7 @@ proc_do_defense_mode(ctl_table *table, int write, struct file * filp, /* Restore the correct value */ *valp = val; } else { - local_bh_disable(); update_defense_level(); - local_bh_enable(); } } return rc; -- cgit From 0b7f22aab4e960c75e82ad696ef852f9b0015e7d Mon Sep 17 00:00:00 2001 From: Olaf Kirch Date: Mon, 11 Jul 2005 21:01:42 -0700 Subject: [IPV4]: Prevent oops when printing martian source In some cases, we may be generating packets with a source address that qualifies as martian. This can happen when we're in the middle of setting up the network, and netfilter decides to reject a packet with an RST. The IPv4 routing code would try to print a warning and oops, because locally generated packets do not have a valid skb->mac.raw pointer at this point. Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 726ea5e8180a..d675ff80b04d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1685,7 +1685,7 @@ static void ip_handle_martian_source(struct net_device *dev, printk(KERN_WARNING "martian source %u.%u.%u.%u from " "%u.%u.%u.%u, on dev %s\n", NIPQUAD(daddr), NIPQUAD(saddr), dev->name); - if (dev->hard_header_len) { + if (dev->hard_header_len && skb->mac.raw) { int i; unsigned char *p = skb->mac.raw; printk(KERN_WARNING "ll header: "); -- cgit From d5950b4355049092739bea97d1bdc14433126cc5 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Mon, 11 Jul 2005 21:03:49 -0700 Subject: [NET]: add a top-level Networking menu to *config Create a new top-level menu named "Networking" thus moving net related options and protocol selection way from the drivers menu and up on the top-level where they belong. To implement this all architectures has to source "net/Kconfig" before drivers/*/Kconfig in their Kconfig file. This change has been implemented for all architectures. Device drivers for ordinary NIC's are still to be found in the Device Drivers section, but Bluetooth, IrDA and ax25 are located with their corresponding menu entries under the new networking menu item. Signed-off-by: Sam Ravnborg Signed-off-by: David S. 
Miller --- net/Kconfig | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/Kconfig b/net/Kconfig index b9ca0a581bde..f46fc326c00b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -2,7 +2,7 @@ # Network configuration # -menu "Networking support" +menu "Networking" config NET bool "Networking support" @@ -10,7 +10,9 @@ config NET Unless you really know what you are doing, you should say Y here. The reason is that some programs need kernel networking support even when running on a stand-alone machine that isn't connected to any - other computer. If you are upgrading from an older kernel, you + other computer. + + If you are upgrading from an older kernel, you should consider updating your networking tools too because changes in the kernel and the tools often go hand in hand. The tools are contained in the package net-tools, the location and version number @@ -640,7 +642,5 @@ source "net/irda/Kconfig" source "net/bluetooth/Kconfig" -source "drivers/net/Kconfig" - -endmenu +endmenu # Networking -- cgit From 6a2e9b738cb5c929df73b6acabdd8f9a4e9a0416 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Mon, 11 Jul 2005 21:13:56 -0700 Subject: [NET]: move config options out to individual protocols Move the protocol specific config options out to the specific protocols. With this change net/Kconfig now starts to become readable and serve as a good basis for further re-structuring. The menu structure is left almost intact, except that indention is fixed in most cases. Most visible are the INET changes where several "depends on INET" are replaced with a single ifdef INET / endif pair. Several new files were created to accomplish this change - they are small but serve the purpose that config options are now distributed out where they belongs. Signed-off-by: Sam Ravnborg Signed-off-by: David S. Miller --- net/8021q/Kconfig | 19 +++ net/Kconfig | 446 ++------------------------------------------------ net/atm/Kconfig | 74 +++++++++ net/bridge/Kconfig | 31 ++++ net/decnet/Kconfig | 23 +++ net/econet/Kconfig | 36 ++++ net/ipv4/Kconfig | 25 +-- net/ipv4/ipvs/Kconfig | 4 +- net/ipv6/Kconfig | 22 ++- net/ipx/Kconfig | 33 ++++ net/lapb/Kconfig | 22 +++ net/packet/Kconfig | 26 +++ net/sched/Kconfig | 37 +++++ net/unix/Kconfig | 21 +++ net/wanrouter/Kconfig | 29 ++++ net/x25/Kconfig | 36 ++++ net/xfrm/Kconfig | 15 ++ 17 files changed, 447 insertions(+), 452 deletions(-) create mode 100644 net/8021q/Kconfig create mode 100644 net/atm/Kconfig create mode 100644 net/bridge/Kconfig create mode 100644 net/econet/Kconfig create mode 100644 net/lapb/Kconfig create mode 100644 net/packet/Kconfig create mode 100644 net/unix/Kconfig create mode 100644 net/wanrouter/Kconfig create mode 100644 net/x25/Kconfig (limited to 'net') diff --git a/net/8021q/Kconfig b/net/8021q/Kconfig new file mode 100644 index 000000000000..c4a382e450e2 --- /dev/null +++ b/net/8021q/Kconfig @@ -0,0 +1,19 @@ +# +# Configuration for 802.1Q VLAN support +# + +config VLAN_8021Q + tristate "802.1Q VLAN Support" + ---help--- + Select this and you will be able to create 802.1Q VLAN interfaces + on your ethernet interfaces. 802.1Q VLAN supports almost + everything a regular ethernet interface does, including + firewalling, bridging, and of course IP traffic. You will need + the 'vconfig' tool from the VLAN project in order to effectively + use VLANs. See the VLAN web page for more information: + + + To compile this code as a module, choose M here: the module + will be called 8021q. + + If unsure, say N. 
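Each protocol fragment created by this commit follows the shape of the net/8021q/Kconfig file above: a short header comment, the config entries, and their help text, kept next to the code they configure. The commit message also notes that every architecture must now source net/Kconfig ahead of its driver menus; schematically, an arch Kconfig ends up ordered as below (a sketch only, not any one architecture's actual file):

	# networking options come first...
	source "net/Kconfig"
	# ...then the device-driver menus
	source "drivers/Kconfig"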
diff --git a/net/Kconfig b/net/Kconfig index f46fc326c00b..2684e809a649 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -22,57 +22,14 @@ config NET recommended to read the NET-HOWTO, available from . -menu "Networking options" - depends on NET - -config PACKET - tristate "Packet socket" - ---help--- - The Packet protocol is used by applications which communicate - directly with network devices without an intermediate network - protocol implemented in the kernel, e.g. tcpdump. If you want them - to work, choose Y. - - To compile this driver as a module, choose M here: the module will - be called af_packet. - - If unsure, say Y. +# Make sure that all config symbols are dependent on NET +if NET -config PACKET_MMAP - bool "Packet socket: mmapped IO" - depends on PACKET - help - If you say Y here, the Packet protocol driver will use an IO - mechanism that results in faster communication. - - If unsure, say N. - -config UNIX - tristate "Unix domain sockets" - ---help--- - If you say Y here, you will include support for Unix domain sockets; - sockets are the standard Unix mechanism for establishing and - accessing network connections. Many commonly used programs such as - the X Window system and syslog use these sockets even if your - machine is not connected to any network. Unless you are working on - an embedded system or something similar, you therefore definitely - want to say Y here. - - To compile this driver as a module, choose M here: the module will be - called unix. Note that several important services won't work - correctly if you say M here and then neglect to load the module. - - Say Y unless you know what you are doing. - -config NET_KEY - tristate "PF_KEY sockets" - select XFRM - ---help--- - PF_KEYv2 socket family, compatible to KAME ones. - They are required if you are going to use IPsec tools ported - from KAME. +menu "Networking options" - Say Y unless you know what you are doing. +source "net/packet/Kconfig" +source "net/unix/Kconfig" +source "net/xfrm/Kconfig" config INET bool "TCP/IP networking" @@ -96,30 +53,12 @@ config INET Short answer: say Y. +if INET source "net/ipv4/Kconfig" - -# IPv6 as module will cause a CRASH if you try to unload it -config IPV6 - tristate "The IPv6 protocol" - depends on INET - default m - select CRYPTO if IPV6_PRIVACY - select CRYPTO_MD5 if IPV6_PRIVACY - ---help--- - This is complemental support for the IP version 6. - You will still be able to do traditional IPv4 networking as well. - - For general information about IPv6, see - . - For Linux IPv6 development information, see . - For specific information about IPv6 under Linux, read the HOWTO at - . - - To compile this protocol support as a module, choose M here: the - module will be called ipv6. - source "net/ipv6/Kconfig" +endif # if INET + menuconfig NETFILTER bool "Network packet filtering (replaces ipchains)" ---help--- @@ -208,269 +147,16 @@ source "net/bridge/netfilter/Kconfig" endif -config XFRM - bool - depends on NET - -source "net/xfrm/Kconfig" - source "net/sctp/Kconfig" - -config ATM - tristate "Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)" - depends on EXPERIMENTAL - ---help--- - ATM is a high-speed networking technology for Local Area Networks - and Wide Area Networks. It uses a fixed packet size and is - connection oriented, allowing for the negotiation of minimum - bandwidth requirements. - - In order to participate in an ATM network, your Linux box needs an - ATM networking card. If you have that, say Y here and to the driver - of your ATM card below. 
- - Note that you need a set of user-space programs to actually make use - of ATM. See the file for - further details. - -config ATM_CLIP - tristate "Classical IP over ATM (EXPERIMENTAL)" - depends on ATM && INET - help - Classical IP over ATM for PVCs and SVCs, supporting InARP and - ATMARP. If you want to communication with other IP hosts on your ATM - network, you will typically either say Y here or to "LAN Emulation - (LANE)" below. - -config ATM_CLIP_NO_ICMP - bool "Do NOT send ICMP if no neighbour (EXPERIMENTAL)" - depends on ATM_CLIP - help - Normally, an "ICMP host unreachable" message is sent if a neighbour - cannot be reached because there is no VC to it in the kernel's - ATMARP table. This may cause problems when ATMARP table entries are - briefly removed during revalidation. If you say Y here, packets to - such neighbours are silently discarded instead. - -config ATM_LANE - tristate "LAN Emulation (LANE) support (EXPERIMENTAL)" - depends on ATM - help - LAN Emulation emulates services of existing LANs across an ATM - network. Besides operating as a normal ATM end station client, Linux - LANE client can also act as an proxy client bridging packets between - ELAN and Ethernet segments. You need LANE if you want to try MPOA. - -config ATM_MPOA - tristate "Multi-Protocol Over ATM (MPOA) support (EXPERIMENTAL)" - depends on ATM && INET && ATM_LANE!=n - help - Multi-Protocol Over ATM allows ATM edge devices such as routers, - bridges and ATM attached hosts establish direct ATM VCs across - subnetwork boundaries. These shortcut connections bypass routers - enhancing overall network performance. - -config ATM_BR2684 - tristate "RFC1483/2684 Bridged protocols" - depends on ATM && INET - help - ATM PVCs can carry ethernet PDUs according to RFC2684 (formerly 1483) - This device will act like an ethernet from the kernels point of view, - with the traffic being carried by ATM PVCs (currently 1 PVC/device). - This is sometimes used over DSL lines. If in doubt, say N. - -config ATM_BR2684_IPFILTER - bool "Per-VC IP filter kludge" - depends on ATM_BR2684 - help - This is an experimental mechanism for users who need to terminate a - large number of IP-only vcc's. Do not enable this unless you are sure - you know what you are doing. - -config BRIDGE - tristate "802.1d Ethernet Bridging" - ---help--- - If you say Y here, then your Linux box will be able to act as an - Ethernet bridge, which means that the different Ethernet segments it - is connected to will appear as one Ethernet to the participants. - Several such bridges can work together to create even larger - networks of Ethernets using the IEEE 802.1 spanning tree algorithm. - As this is a standard, Linux bridges will cooperate properly with - other third party bridge products. - - In order to use the Ethernet bridge, you'll need the bridge - configuration tools; see - for location. Please read the Bridge mini-HOWTO for more - information. - - If you enable iptables support along with the bridge support then you - turn your bridge into a bridging IP firewall. - iptables will then see the IP packets being bridged, so you need to - take this into account when setting up your firewall rules. - Enabling arptables support when bridging will let arptables see - bridged ARP traffic in the arptables FORWARD chain. - - To compile this code as a module, choose M here: the module - will be called bridge. - - If unsure, say N. 
- -config VLAN_8021Q - tristate "802.1Q VLAN Support" - ---help--- - Select this and you will be able to create 802.1Q VLAN interfaces - on your ethernet interfaces. 802.1Q VLAN supports almost - everything a regular ethernet interface does, including - firewalling, bridging, and of course IP traffic. You will need - the 'vconfig' tool from the VLAN project in order to effectively - use VLANs. See the VLAN web page for more information: - - - To compile this code as a module, choose M here: the module - will be called 8021q. - - If unsure, say N. - -config DECNET - tristate "DECnet Support" - ---help--- - The DECnet networking protocol was used in many products made by - Digital (now Compaq). It provides reliable stream and sequenced - packet communications over which run a variety of services similar - to those which run over TCP/IP. - - To find some tools to use with the kernel layer support, please - look at Patrick Caulfield's web site: - . - - More detailed documentation is available in - . - - Be sure to say Y to "/proc file system support" and "Sysctl support" - below when using DECnet, since you will need sysctl support to aid - in configuration at run time. - - The DECnet code is also available as a module ( = code which can be - inserted in and removed from the running kernel whenever you want). - The module is called decnet. - +source "net/atm/Kconfig" +source "net/bridge/Kconfig" +source "net/8021q/Kconfig" source "net/decnet/Kconfig" - source "net/llc/Kconfig" - -config IPX - tristate "The IPX protocol" - select LLC - ---help--- - This is support for the Novell networking protocol, IPX, commonly - used for local networks of Windows machines. You need it if you - want to access Novell NetWare file or print servers using the Linux - Novell client ncpfs (available from - ) or from - within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO, - available from ). In order - to do the former, you'll also have to say Y to "NCP file system - support", below. - - IPX is similar in scope to IP, while SPX, which runs on top of IPX, - is similar to TCP. There is also experimental support for SPX in - Linux (see "SPX networking", below). - - To turn your Linux box into a fully featured NetWare file server and - IPX router, say Y here and fetch either lwared from - or - mars_nwe from . For more - information, read the IPX-HOWTO available from - . - - General information about how to connect Linux, Windows machines and - Macs is on the WWW at . - - The IPX driver would enlarge your kernel by about 16 KB. To compile - this driver as a module, choose M here: the module will be called ipx. - Unless you want to integrate your Linux box with a local Novell - network, say N. - source "net/ipx/Kconfig" - -config ATALK - tristate "Appletalk protocol support" - select LLC - ---help--- - AppleTalk is the protocol that Apple computers can use to communicate - on a network. If your Linux box is connected to such a network and you - wish to connect to it, say Y. You will need to use the netatalk package - so that your Linux box can act as a print and file server for Macs as - well as access AppleTalk printers. Check out - on the WWW for details. - EtherTalk is the name used for AppleTalk over Ethernet and the - cheaper and slower LocalTalk is AppleTalk over a proprietary Apple - network using serial links. EtherTalk and LocalTalk are fully - supported by Linux. - - General information about how to connect Linux, Windows machines and - Macs is on the WWW at . 
The - NET-3-HOWTO, available from - , contains valuable - information as well. - - To compile this driver as a module, choose M here: the module will be - called appletalk. You almost certainly want to compile it as a - module so you can restart your AppleTalk stack without rebooting - your machine. I hear that the GNU boycott of Apple is over, so - even politically correct people are allowed to say Y here. - source "drivers/net/appletalk/Kconfig" - -config X25 - tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)" - depends on EXPERIMENTAL - ---help--- - X.25 is a set of standardized network protocols, similar in scope to - frame relay; the one physical line from your box to the X.25 network - entry point can carry several logical point-to-point connections - (called "virtual circuits") to other computers connected to the X.25 - network. Governments, banks, and other organizations tend to use it - to connect to each other or to form Wide Area Networks (WANs). Many - countries have public X.25 networks. X.25 consists of two - protocols: the higher level Packet Layer Protocol (PLP) (say Y here - if you want that) and the lower level data link layer protocol LAPB - (say Y to "LAPB Data Link Driver" below if you want that). - - You can read more about X.25 at and - . - Information about X.25 for Linux is contained in the files - and - . - - One connects to an X.25 network either with a dedicated network card - using the X.21 protocol (not yet supported by Linux) or one can do - X.25 over a standard telephone line using an ordinary modem (say Y - to "X.25 async driver" below) or over Ethernet using an ordinary - Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link - Driver" and "LAPB over Ethernet driver" below). - - To compile this driver as a module, choose M here: the module - will be called x25. If unsure, say N. - -config LAPB - tristate "LAPB Data Link Driver (EXPERIMENTAL)" - depends on EXPERIMENTAL - ---help--- - Link Access Procedure, Balanced (LAPB) is the data link layer (i.e. - the lower) part of the X.25 protocol. It offers a reliable - connection service to exchange data frames with one other host, and - it is used to transport higher level protocols (mostly X.25 Packet - Layer, the higher part of X.25, but others are possible as well). - Usually, LAPB is used with specialized X.21 network cards, but Linux - currently supports LAPB only over Ethernet connections. If you want - to use LAPB connections over Ethernet, say Y here and to "LAPB over - Ethernet driver" below. Read - for technical - details. - - To compile this driver as a module, choose M here: the - module will be called lapb. If unsure, say N. +source "net/x25/Kconfig" +source "net/lapb/Kconfig" config NET_DIVERT bool "Frame Diverter (EXPERIMENTAL)" @@ -498,107 +184,10 @@ config NET_DIVERT If unsure, say N. -config ECONET - tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)" - depends on EXPERIMENTAL && INET - ---help--- - Econet is a fairly old and slow networking protocol mainly used by - Acorn computers to access file and print servers. It uses native - Econet network cards. AUN is an implementation of the higher level - parts of Econet that runs over ordinary Ethernet connections, on - top of the UDP packet protocol, which in turn runs on top of the - Internet protocol IP. - - If you say Y here, you can choose with the next two options whether - to send Econet/AUN traffic over a UDP Ethernet connection or over - a native Econet network card. 
- - To compile this driver as a module, choose M here: the module - will be called econet. - -config ECONET_AUNUDP - bool "AUN over UDP" - depends on ECONET - help - Say Y here if you want to send Econet/AUN traffic over a UDP - connection (UDP is a packet based protocol that runs on top of the - Internet protocol IP) using an ordinary Ethernet network card. - -config ECONET_NATIVE - bool "Native Econet" - depends on ECONET - help - Say Y here if you have a native Econet network card installed in - your computer. - -config WAN_ROUTER - tristate "WAN router" - depends on EXPERIMENTAL - ---help--- - Wide Area Networks (WANs), such as X.25, frame relay and leased - lines, are used to interconnect Local Area Networks (LANs) over vast - distances with data transfer rates significantly higher than those - achievable with commonly used asynchronous modem connections. - Usually, a quite expensive external device called a `WAN router' is - needed to connect to a WAN. - - As an alternative, WAN routing can be built into the Linux kernel. - With relatively inexpensive WAN interface cards available on the - market, a perfectly usable router can be built for less than half - the price of an external router. If you have one of those cards and - wish to use your Linux box as a WAN router, say Y here and also to - the WAN driver for your card, below. You will then need the - wan-tools package which is available from . - Read for more - information. - - To compile WAN routing support as a module, choose M here: the - module will be called wanrouter. - - If unsure, say N. - -menu "QoS and/or fair queueing" - -config NET_SCHED - bool "QoS and/or fair queueing" - ---help--- - When the kernel has several packets to send out over a network - device, it has to decide which ones to send first, which ones to - delay, and which ones to drop. This is the job of the packet - scheduler, and several different algorithms for how to do this - "fairly" have been proposed. - - If you say N here, you will get the standard packet scheduler, which - is a FIFO (first come, first served). If you say Y here, you will be - able to choose from among several alternative algorithms which can - then be attached to different network devices. This is useful for - example if some of your network devices are real time devices that - need a certain minimum data flow rate, or if you need to limit the - maximum data flow rate for traffic which matches specified criteria. - This code is considered to be experimental. - - To administer these schedulers, you'll need the user-level utilities - from the package iproute2+tc at . - That package also contains some documentation; for more, check out - . - - This Quality of Service (QoS) support will enable you to use - Differentiated Services (diffserv) and Resource Reservation Protocol - (RSVP) on your Linux router if you also say Y to "QoS support", - "Packet classifier API" and to some classifiers below. Documentation - and software is at . - - If you say Y here and to "/proc file system" below, you will be able - to read status information about packet schedulers from the file - /proc/net/psched. - - The available schedulers are listed in the following questions; you - can say Y to as many as you like. If unsure, say N now. 
- +source "net/econet/Kconfig" +source "net/wanrouter/Kconfig" source "net/sched/Kconfig" -endmenu - menu "Network testing" config NET_PKTGEN @@ -637,10 +226,9 @@ config NET_POLL_CONTROLLER def_bool NETPOLL source "net/ax25/Kconfig" - source "net/irda/Kconfig" - source "net/bluetooth/Kconfig" +endif # if NET endmenu # Networking diff --git a/net/atm/Kconfig b/net/atm/Kconfig new file mode 100644 index 000000000000..bea2426229b1 --- /dev/null +++ b/net/atm/Kconfig @@ -0,0 +1,74 @@ +# +# Asynchronous Transfer Mode (ATM) (EXPERIMENTAL) +# + +config ATM + tristate "Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)" + depends on EXPERIMENTAL + ---help--- + ATM is a high-speed networking technology for Local Area Networks + and Wide Area Networks. It uses a fixed packet size and is + connection oriented, allowing for the negotiation of minimum + bandwidth requirements. + + In order to participate in an ATM network, your Linux box needs an + ATM networking card. If you have that, say Y here and to the driver + of your ATM card below. + + Note that you need a set of user-space programs to actually make use + of ATM. See the file for + further details. + +config ATM_CLIP + tristate "Classical IP over ATM (EXPERIMENTAL)" + depends on ATM && INET + help + Classical IP over ATM for PVCs and SVCs, supporting InARP and + ATMARP. If you want to communication with other IP hosts on your ATM + network, you will typically either say Y here or to "LAN Emulation + (LANE)" below. + +config ATM_CLIP_NO_ICMP + bool "Do NOT send ICMP if no neighbour (EXPERIMENTAL)" + depends on ATM_CLIP + help + Normally, an "ICMP host unreachable" message is sent if a neighbour + cannot be reached because there is no VC to it in the kernel's + ATMARP table. This may cause problems when ATMARP table entries are + briefly removed during revalidation. If you say Y here, packets to + such neighbours are silently discarded instead. + +config ATM_LANE + tristate "LAN Emulation (LANE) support (EXPERIMENTAL)" + depends on ATM + help + LAN Emulation emulates services of existing LANs across an ATM + network. Besides operating as a normal ATM end station client, Linux + LANE client can also act as an proxy client bridging packets between + ELAN and Ethernet segments. You need LANE if you want to try MPOA. + +config ATM_MPOA + tristate "Multi-Protocol Over ATM (MPOA) support (EXPERIMENTAL)" + depends on ATM && INET && ATM_LANE!=n + help + Multi-Protocol Over ATM allows ATM edge devices such as routers, + bridges and ATM attached hosts establish direct ATM VCs across + subnetwork boundaries. These shortcut connections bypass routers + enhancing overall network performance. + +config ATM_BR2684 + tristate "RFC1483/2684 Bridged protocols" + depends on ATM && INET + help + ATM PVCs can carry ethernet PDUs according to rfc2684 (formerly 1483) + This device will act like an ethernet from the kernels point of view, + with the traffic being carried by ATM PVCs (currently 1 PVC/device). + This is sometimes used over DSL lines. If in doubt, say N. + +config ATM_BR2684_IPFILTER + bool "Per-VC IP filter kludge" + depends on ATM_BR2684 + help + This is an experimental mechanism for users who need to terminating a + large number of IP-only vcc's. Do not enable this unless you are sure + you know what you are doing. 
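One detail worth noting in the atm/Kconfig file above is the ATM_MPOA dependency "ATM && INET && ATM_LANE!=n". The "!=n" comparison yields a plain boolean, so it only requires LANE to be enabled (y or m) and does not cap MPOA's own tristate value the way a bare "depends on ATM_LANE" would. A minimal sketch of the idiom, using placeholder symbols FOO and BAR that are not part of the patch:

	config FOO
		tristate "Foo support"

	config BAR
		tristate "Bar support (requires Foo)"
		# FOO!=n is true for FOO=y or FOO=m; a bare 'depends on FOO'
		# would additionally limit BAR to m whenever FOO=m
		depends on FOO!=n
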
diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig new file mode 100644 index 000000000000..db23d59746cf --- /dev/null +++ b/net/bridge/Kconfig @@ -0,0 +1,31 @@ +# +# 802.1d Ethernet Bridging +# + +config BRIDGE + tristate "802.1d Ethernet Bridging" + ---help--- + If you say Y here, then your Linux box will be able to act as an + Ethernet bridge, which means that the different Ethernet segments it + is connected to will appear as one Ethernet to the participants. + Several such bridges can work together to create even larger + networks of Ethernets using the IEEE 802.1 spanning tree algorithm. + As this is a standard, Linux bridges will cooperate properly with + other third party bridge products. + + In order to use the Ethernet bridge, you'll need the bridge + configuration tools; see + for location. Please read the Bridge mini-HOWTO for more + information. + + If you enable iptables support along with the bridge support then you + turn your bridge into a bridging IP firewall. + iptables will then see the IP packets being bridged, so you need to + take this into account when setting up your firewall rules. + Enabling arptables support when bridging will let arptables see + bridged ARP traffic in the arptables FORWARD chain. + + To compile this code as a module, choose M here: the module + will be called bridge. + + If unsure, say N. diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig index 2101da542ba8..92f2ec46fd22 100644 --- a/net/decnet/Kconfig +++ b/net/decnet/Kconfig @@ -1,6 +1,29 @@ # # DECnet configuration # +config DECNET + tristate "DECnet Support" + ---help--- + The DECnet networking protocol was used in many products made by + Digital (now Compaq). It provides reliable stream and sequenced + packet communications over which run a variety of services similar + to those which run over TCP/IP. + + To find some tools to use with the kernel layer support, please + look at Patrick Caulfield's web site: + . + + More detailed documentation is available in + . + + Be sure to say Y to "/proc file system support" and "Sysctl support" + below when using DECnet, since you will need sysctl support to aid + in configuration at run time. + + The DECnet code is also available as a module ( = code which can be + inserted in and removed from the running kernel whenever you want). + The module is called decnet. + config DECNET_ROUTER bool "DECnet: router support (EXPERIMENTAL)" depends on DECNET && EXPERIMENTAL diff --git a/net/econet/Kconfig b/net/econet/Kconfig new file mode 100644 index 000000000000..39a2d2975e0e --- /dev/null +++ b/net/econet/Kconfig @@ -0,0 +1,36 @@ +# +# Acorn Econet/AUN protocols +# + +config ECONET + tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)" + depends on EXPERIMENTAL && INET + ---help--- + Econet is a fairly old and slow networking protocol mainly used by + Acorn computers to access file and print servers. It uses native + Econet network cards. AUN is an implementation of the higher level + parts of Econet that runs over ordinary Ethernet connections, on + top of the UDP packet protocol, which in turn runs on top of the + Internet protocol IP. + + If you say Y here, you can choose with the next two options whether + to send Econet/AUN traffic over a UDP Ethernet connection or over + a native Econet network card. + + To compile this driver as a module, choose M here: the module + will be called econet. 
+ +config ECONET_AUNUDP + bool "AUN over UDP" + depends on ECONET + help + Say Y here if you want to send Econet/AUN traffic over a UDP + connection (UDP is a packet based protocol that runs on top of the + Internet protocol IP) using an ordinary Ethernet network card. + +config ECONET_NATIVE + bool "Native Econet" + depends on ECONET + help + Say Y here if you have a native Econet network card installed in + your computer. diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 3e63123f7bbd..df5386885a90 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -3,7 +3,6 @@ # config IP_MULTICAST bool "IP: multicasting" - depends on INET help This is code for addressing several networked computers at once, enlarging your kernel by about 2 KB. You need multicasting if you @@ -17,7 +16,6 @@ config IP_MULTICAST config IP_ADVANCED_ROUTER bool "IP: advanced router" - depends on INET ---help--- If you intend to run your Linux box mostly as a router, i.e. as a computer that forwards and redistributes network packets, say Y; you @@ -183,7 +181,6 @@ config IP_ROUTE_VERBOSE config IP_PNP bool "IP: kernel level autoconfiguration" - depends on INET help This enables automatic configuration of IP addresses of devices and of the routing table during kernel boot, based on either information @@ -242,7 +239,6 @@ config IP_PNP_RARP # bool ' IP: ARP support' CONFIG_IP_PNP_ARP config NET_IPIP tristate "IP: tunneling" - depends on INET select INET_TUNNEL ---help--- Tunneling means encapsulating data of one protocol type within @@ -260,7 +256,6 @@ config NET_IPIP config NET_IPGRE tristate "IP: GRE tunnels over IP" - depends on INET select XFRM help Tunneling means encapsulating data of one protocol type within @@ -319,7 +314,7 @@ config IP_PIMSM_V2 config ARPD bool "IP: ARP daemon support (EXPERIMENTAL)" - depends on INET && EXPERIMENTAL + depends on EXPERIMENTAL ---help--- Normally, the kernel maintains an internal cache which maps IP addresses to hardware addresses on the local network, so that @@ -344,7 +339,6 @@ config ARPD config SYN_COOKIES bool "IP: TCP syncookie support (disabled per default)" - depends on INET ---help--- Normal TCP/IP networking is open to an attack known as "SYN flooding". This denial-of-service attack prevents legitimate remote @@ -381,7 +375,6 @@ config SYN_COOKIES config INET_AH tristate "IP: AH transformation" - depends on INET select XFRM select CRYPTO select CRYPTO_HMAC @@ -394,7 +387,6 @@ config INET_AH config INET_ESP tristate "IP: ESP transformation" - depends on INET select XFRM select CRYPTO select CRYPTO_HMAC @@ -408,7 +400,6 @@ config INET_ESP config INET_IPCOMP tristate "IP: IPComp transformation" - depends on INET select XFRM select INET_TUNNEL select CRYPTO @@ -421,7 +412,6 @@ config INET_IPCOMP config INET_TUNNEL tristate "IP: tunnel transformation" - depends on INET select XFRM ---help--- Support for generic IP tunnel transformation, which is required by @@ -431,7 +421,6 @@ config INET_TUNNEL config IP_TCPDIAG tristate "IP: TCP socket monitoring interface" - depends on INET default y ---help--- Support for TCP socket monitoring interface used by native Linux @@ -447,7 +436,6 @@ config IP_TCPDIAG_IPV6 config TCP_CONG_ADVANCED bool "TCP: advanced congestion control" - depends on INET ---help--- Support for selection of various TCP congestion control modules. 
@@ -463,7 +451,6 @@ menu "TCP congestion control" config TCP_CONG_BIC tristate "Binary Increase Congestion (BIC) control" - depends on INET default y ---help--- BIC-TCP is a sender-side only change that ensures a linear RTT @@ -478,7 +465,6 @@ config TCP_CONG_BIC config TCP_CONG_WESTWOOD tristate "TCP Westwood+" - depends on INET default m ---help--- TCP Westwood+ is a sender-side only modification of the TCP Reno @@ -493,7 +479,6 @@ config TCP_CONG_WESTWOOD config TCP_CONG_HTCP tristate "H-TCP" - depends on INET default m ---help--- H-TCP is a send-side only modifications of the TCP Reno @@ -505,7 +490,7 @@ config TCP_CONG_HTCP config TCP_CONG_HSTCP tristate "High Speed TCP" - depends on INET && EXPERIMENTAL + depends on EXPERIMENTAL default n ---help--- Sally Floyd's High Speed TCP (RFC 3649) congestion control. @@ -516,7 +501,7 @@ config TCP_CONG_HSTCP config TCP_CONG_HYBLA tristate "TCP-Hybla congestion control algorithm" - depends on INET && EXPERIMENTAL + depends on EXPERIMENTAL default n ---help--- TCP-Hybla is a sender-side only change that eliminates penalization of @@ -526,7 +511,7 @@ config TCP_CONG_HYBLA config TCP_CONG_VEGAS tristate "TCP Vegas" - depends on INET && EXPERIMENTAL + depends on EXPERIMENTAL default n ---help--- TCP Vegas is a sender-side only change to TCP that anticipates @@ -537,7 +522,7 @@ config TCP_CONG_VEGAS config TCP_CONG_SCALABLE tristate "Scalable TCP" - depends on INET && EXPERIMENTAL + depends on EXPERIMENTAL default n ---help--- Scalable TCP is a sender-side only change to TCP which uses a diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig index 63a82b4b64bb..c9820bfc493a 100644 --- a/net/ipv4/ipvs/Kconfig +++ b/net/ipv4/ipvs/Kconfig @@ -2,11 +2,11 @@ # IP Virtual Server configuration # menu "IP: Virtual Server Configuration" - depends on INET && NETFILTER + depends on NETFILTER config IP_VS tristate "IP virtual server support (EXPERIMENTAL)" - depends on INET && NETFILTER + depends on NETFILTER ---help--- IP Virtual Server support will let you build a high-performance virtual server based on cluster of two or more real servers. This diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index e66ca9381cfd..95163cd52ae0 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -1,6 +1,26 @@ # # IPv6 configuration -# +# + +# IPv6 as module will cause a CRASH if you try to unload it +config IPV6 + tristate "The IPv6 protocol" + default m + select CRYPTO if IPV6_PRIVACY + select CRYPTO_MD5 if IPV6_PRIVACY + ---help--- + This is complemental support for the IP version 6. + You will still be able to do traditional IPv4 networking as well. + + For general information about IPv6, see + . + For Linux IPv6 development information, see . + For specific information about IPv6 under Linux, read the HOWTO at + . + + To compile this protocol support as a module, choose M here: the + module will be called ipv6. + config IPV6_PRIVACY bool "IPv6: Privacy Extensions (RFC 3041) support" depends on IPV6 diff --git a/net/ipx/Kconfig b/net/ipx/Kconfig index a16237c0e783..980a826f5d02 100644 --- a/net/ipx/Kconfig +++ b/net/ipx/Kconfig @@ -1,6 +1,39 @@ # # IPX configuration # +config IPX + tristate "The IPX protocol" + select LLC + ---help--- + This is support for the Novell networking protocol, IPX, commonly + used for local networks of Windows machines. You need it if you + want to access Novell NetWare file or print servers using the Linux + Novell client ncpfs (available from + ) or from + within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO, + available from ). 
In order + to do the former, you'll also have to say Y to "NCP file system + support", below. + + IPX is similar in scope to IP, while SPX, which runs on top of IPX, + is similar to TCP. There is also experimental support for SPX in + Linux (see "SPX networking", below). + + To turn your Linux box into a fully featured NetWare file server and + IPX router, say Y here and fetch either lwared from + or + mars_nwe from . For more + information, read the IPX-HOWTO available from + . + + General information about how to connect Linux, Windows machines and + Macs is on the WWW at . + + The IPX driver would enlarge your kernel by about 16 KB. To compile + this driver as a module, choose M here: the module will be called ipx. + Unless you want to integrate your Linux box with a local Novell + network, say N. + config IPX_INTERN bool "IPX: Full internal IPX network" depends on IPX diff --git a/net/lapb/Kconfig b/net/lapb/Kconfig new file mode 100644 index 000000000000..f0b5efb31a00 --- /dev/null +++ b/net/lapb/Kconfig @@ -0,0 +1,22 @@ +# +# LAPB Data Link Drive +# + +config LAPB + tristate "LAPB Data Link Driver (EXPERIMENTAL)" + depends on EXPERIMENTAL + ---help--- + Link Access Procedure, Balanced (LAPB) is the data link layer (i.e. + the lower) part of the X.25 protocol. It offers a reliable + connection service to exchange data frames with one other host, and + it is used to transport higher level protocols (mostly X.25 Packet + Layer, the higher part of X.25, but others are possible as well). + Usually, LAPB is used with specialized X.21 network cards, but Linux + currently supports LAPB only over Ethernet connections. If you want + to use LAPB connections over Ethernet, say Y here and to "LAPB over + Ethernet driver" below. Read + for technical + details. + + To compile this driver as a module, choose M here: the + module will be called lapb. If unsure, say N. diff --git a/net/packet/Kconfig b/net/packet/Kconfig new file mode 100644 index 000000000000..34ff93ff894d --- /dev/null +++ b/net/packet/Kconfig @@ -0,0 +1,26 @@ +# +# Packet configuration +# + +config PACKET + tristate "Packet socket" + ---help--- + The Packet protocol is used by applications which communicate + directly with network devices without an intermediate network + protocol implemented in the kernel, e.g. tcpdump. If you want them + to work, choose Y. + + To compile this driver as a module, choose M here: the module will + be called af_packet. + + If unsure, say Y. + +config PACKET_MMAP + bool "Packet socket: mmapped IO" + depends on PACKET + help + If you say Y here, the Packet protocol driver will use an IO + mechanism that results in faster communication. + + If unsure, say N. + diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 7bac249258e3..59d3e71f8b85 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -1,6 +1,43 @@ # # Traffic control configuration. # + +menuconfig NET_SCHED + bool "QoS and/or fair queueing" + ---help--- + When the kernel has several packets to send out over a network + device, it has to decide which ones to send first, which ones to + delay, and which ones to drop. This is the job of the packet + scheduler, and several different algorithms for how to do this + "fairly" have been proposed. + + If you say N here, you will get the standard packet scheduler, which + is a FIFO (first come, first served). If you say Y here, you will be + able to choose from among several alternative algorithms which can + then be attached to different network devices. 
This is useful for + example if some of your network devices are real time devices that + need a certain minimum data flow rate, or if you need to limit the + maximum data flow rate for traffic which matches specified criteria. + This code is considered to be experimental. + + To administer these schedulers, you'll need the user-level utilities + from the package iproute2+tc at . + That package also contains some documentation; for more, check out + . + + This Quality of Service (QoS) support will enable you to use + Differentiated Services (diffserv) and Resource Reservation Protocol + (RSVP) on your Linux router if you also say Y to "QoS support", + "Packet classifier API" and to some classifiers below. Documentation + and software is at . + + If you say Y here and to "/proc file system" below, you will be able + to read status information about packet schedulers from the file + /proc/net/psched. + + The available schedulers are listed in the following questions; you + can say Y to as many as you like. If unsure, say N now. + choice prompt "Packet scheduler clock source" depends on NET_SCHED diff --git a/net/unix/Kconfig b/net/unix/Kconfig new file mode 100644 index 000000000000..5a69733bcdad --- /dev/null +++ b/net/unix/Kconfig @@ -0,0 +1,21 @@ +# +# Unix Domain Sockets +# + +config UNIX + tristate "Unix domain sockets" + ---help--- + If you say Y here, you will include support for Unix domain sockets; + sockets are the standard Unix mechanism for establishing and + accessing network connections. Many commonly used programs such as + the X Window system and syslog use these sockets even if your + machine is not connected to any network. Unless you are working on + an embedded system or something similar, you therefore definitely + want to say Y here. + + To compile this driver as a module, choose M here: the module will be + called unix. Note that several important services won't work + correctly if you say M here and then neglect to load the module. + + Say Y unless you know what you are doing. + diff --git a/net/wanrouter/Kconfig b/net/wanrouter/Kconfig new file mode 100644 index 000000000000..1debe1cb054e --- /dev/null +++ b/net/wanrouter/Kconfig @@ -0,0 +1,29 @@ +# +# Configuration for WAN router +# + +config WAN_ROUTER + tristate "WAN router" + depends on EXPERIMENTAL + ---help--- + Wide Area Networks (WANs), such as X.25, frame relay and leased + lines, are used to interconnect Local Area Networks (LANs) over vast + distances with data transfer rates significantly higher than those + achievable with commonly used asynchronous modem connections. + Usually, a quite expensive external device called a `WAN router' is + needed to connect to a WAN. + + As an alternative, WAN routing can be built into the Linux kernel. + With relatively inexpensive WAN interface cards available on the + market, a perfectly usable router can be built for less than half + the price of an external router. If you have one of those cards and + wish to use your Linux box as a WAN router, say Y here and also to + the WAN driver for your card, below. You will then need the + wan-tools package which is available from . + Read for more + information. + + To compile WAN routing support as a module, choose M here: the + module will be called wanrouter. + + If unsure, say N. 
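Further up, the sched/Kconfig hunk converts NET_SCHED from a menu/config pair in net/Kconfig into a single menuconfig entry. A menuconfig defines an ordinary symbol but lets the dependent options that follow it collapse into a submenu in the *config front ends. A minimal sketch of the pattern (EXAMPLE_SCHED is a placeholder symbol, not one from the patch):

	menuconfig NET_SCHED
		bool "QoS and/or fair queueing"

	# entries that depend on the menuconfig symbol nest beneath it
	config EXAMPLE_SCHED
		tristate "Example packet scheduler"
		depends on NET_SCHED
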
diff --git a/net/x25/Kconfig b/net/x25/Kconfig new file mode 100644 index 000000000000..e6759c9660bb --- /dev/null +++ b/net/x25/Kconfig @@ -0,0 +1,36 @@ +# +# CCITT X.25 Packet Layer +# + +config X25 + tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)" + depends on EXPERIMENTAL + ---help--- + X.25 is a set of standardized network protocols, similar in scope to + frame relay; the one physical line from your box to the X.25 network + entry point can carry several logical point-to-point connections + (called "virtual circuits") to other computers connected to the X.25 + network. Governments, banks, and other organizations tend to use it + to connect to each other or to form Wide Area Networks (WANs). Many + countries have public X.25 networks. X.25 consists of two + protocols: the higher level Packet Layer Protocol (PLP) (say Y here + if you want that) and the lower level data link layer protocol LAPB + (say Y to "LAPB Data Link Driver" below if you want that). + + You can read more about X.25 at and + . + Information about X.25 for Linux is contained in the files + and + . + + One connects to an X.25 network either with a dedicated network card + using the X.21 protocol (not yet supported by Linux) or one can do + X.25 over a standard telephone line using an ordinary modem (say Y + to "X.25 async driver" below) or over Ethernet using an ordinary + Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link + Driver" and "LAPB over Ethernet driver" below). + + To compile this driver as a module, choose M here: the module + will be called x25. If unsure, say N. + + diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 58ca6a972c48..0c1c04322baf 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -1,6 +1,10 @@ # # XFRM configuration # +config XFRM + bool + depends on NET + config XFRM_USER tristate "IPsec user configuration interface" depends on INET && XFRM @@ -10,3 +14,14 @@ config XFRM_USER If unsure, say Y. +config NET_KEY + tristate "PF_KEY sockets" + select XFRM + ---help--- + PF_KEYv2 socket family, compatible to KAME ones. + They are required if you are going to use IPsec tools ported + from KAME. + + Say Y unless you know what you are doing. + + -- cgit From 84531c24f27b02daa8e54e2bb6dc74a730fdf0a5 Mon Sep 17 00:00:00 2001 From: Phil Oester Date: Tue, 12 Jul 2005 11:57:52 -0700 Subject: [NETFILTER]: Revert nf_reset change Revert the nf_reset change that caused so much trouble, drop conntrack references manually before packets are queued to packet sockets. Signed-off-by: Phil Oester Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 9 --------- net/ipv4/netfilter/ip_conntrack_standalone.c | 7 +++++++ net/packet/af_packet.c | 6 ++++++ 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 9de83e6e0f1d..80d13103b2b0 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -107,7 +107,6 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb) newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; BUG_TRAP(newskb->dst); - nf_reset(newskb); netif_rx(newskb); return 0; } @@ -188,14 +187,6 @@ static inline int ip_finish_output2(struct sk_buff *skb) skb = skb2; } -#ifdef CONFIG_BRIDGE_NETFILTER - /* bridge-netfilter defers calling some IP hooks to the bridge layer - * and still needs the conntrack reference. 
- */ - if (skb->nf_bridge == NULL) -#endif - nf_reset(skb); - if (hh) { int hh_alen; diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 42dc95102873..1dd824f3cf0a 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -432,6 +432,13 @@ static unsigned int ip_conntrack_defrag(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { +#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE) + /* Previously seen (loopback)? Ignore. Do this before + fragment check. */ + if ((*pskb)->nfct) + return NF_ACCEPT; +#endif + /* Gather fragments. */ if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { *pskb = ip_ct_gather_frags(*pskb, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 0269616e75a1..c9d5980aa4de 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -274,6 +274,9 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct dst_release(skb->dst); skb->dst = NULL; + /* drop conntrack reference */ + nf_reset(skb); + spkt = (struct sockaddr_pkt*)skb->cb; skb_push(skb, skb->data-skb->mac.raw); @@ -517,6 +520,9 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packe dst_release(skb->dst); skb->dst = NULL; + /* drop conntrack reference */ + nf_reset(skb); + spin_lock(&sk->sk_receive_queue.lock); po->stats.tp_packets++; __skb_queue_tail(&sk->sk_receive_queue, skb); -- cgit From ab611487d8ada506e511d2b8f22fb8e7be9939b9 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 12 Jul 2005 12:08:43 -0700 Subject: [NET]: __be'ify *_type_trans() tr_type_trans(), hippi_type_trans() left as-is. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/802/fddi.c | 4 ++-- net/atm/br2684.c | 3 +-- net/ethernet/eth.c | 2 +- net/wanrouter/wanmain.c | 6 +++--- 4 files changed, 7 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/802/fddi.c b/net/802/fddi.c index ebcf4830d6f1..5ce24c4bb840 100644 --- a/net/802/fddi.c +++ b/net/802/fddi.c @@ -122,10 +122,10 @@ static int fddi_rebuild_header(struct sk_buff *skb) * the proper pointer to the start of packet data (skb->data). */ -unsigned short fddi_type_trans(struct sk_buff *skb, struct net_device *dev) +__be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev) { struct fddihdr *fddi = (struct fddihdr *)skb->data; - unsigned short type; + __be16 type; /* * Set mac.raw field to point to FC byte, set data field to point diff --git a/net/atm/br2684.c b/net/atm/br2684.c index e6954cf1459d..289956c4dd3e 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -289,8 +289,7 @@ xmit will add the additional header part in that case */ * This is similar to eth_type_trans, which cannot be used because of * our dev->hard_header_len */ -static inline unsigned short br_type_trans(struct sk_buff *skb, - struct net_device *dev) +static inline __be16 br_type_trans(struct sk_buff *skb, struct net_device *dev) { struct ethhdr *eth; unsigned char *rawp; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index ab60ea63688e..f6dbfb99b14d 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -155,7 +155,7 @@ int eth_rebuild_header(struct sk_buff *skb) * This is normal practice and works for any 'now in use' protocol. 
*/ -unsigned short eth_type_trans(struct sk_buff *skb, struct net_device *dev) +__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) { struct ethhdr *eth; unsigned char *rawp; diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c index d6844ac226f5..13b650ad22e2 100644 --- a/net/wanrouter/wanmain.c +++ b/net/wanrouter/wanmain.c @@ -358,10 +358,10 @@ int wanrouter_encapsulate(struct sk_buff *skb, struct net_device *dev, */ -unsigned short wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev) +__be16 wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev) { int cnt = skb->data[0] ? 0 : 1; /* there may be a pad present */ - unsigned short ethertype; + __be16 ethertype; switch (skb->data[cnt]) { case NLPID_IP: /* IP datagramm */ @@ -379,7 +379,7 @@ unsigned short wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev) skb->data[cnt+3], dev->name); return 0; } - ethertype = *((__be16*)&skb->data[cnt+4]); cnt += 6; break; -- cgit From f4637b55ba960d9987a836617271659e9b7b0de8 Mon Sep 17 00:00:00 2001 From: Tommy Christensen Date: Tue, 12 Jul 2005 12:13:49 -0700 Subject: [VLAN]: Fix early vlan adding leading to non-functional device OK, I can see what's happening here. eth0 doesn't detect link-up until after a few seconds, so when the vlan interface is opened immediately after eth0 has been opened, it inherits the link-down state. Subsequently the vlan interface is never properly activated and is thus unable to transmit any packets. dev->state bits are not supposed to be manipulated directly. Something similar is probably needed for the netif_device_present() bit, although I don't know how this is meant to work for a virtual device. Signed-off-by: David S. Miller --- net/8021q/vlan.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 1f6d31670bc7..91e412b0ab00 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -578,6 +578,14 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, if (!vlandev) continue; + if (netif_carrier_ok(dev)) { + if (!netif_carrier_ok(vlandev)) + netif_carrier_on(vlandev); + } else { + if (netif_carrier_ok(vlandev)) + netif_carrier_off(vlandev); + } + if ((vlandev->state & VLAN_LINK_STATE_MASK) != flgs) { vlandev->state = (vlandev->state &~ VLAN_LINK_STATE_MASK) | flgs; -- cgit From d7c7ed4dbc66c5f4dfa5615cdfc5009af3b3b137 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Mon, 18 Jul 2005 13:29:49 -0700 Subject: [PKT_SCHED]: Remove debugging leftover from textsearch ematch Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller --- net/sched/em_text.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 873840d8d072..77beabc91fa3 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -55,9 +55,6 @@ static int em_text_change(struct tcf_proto *tp, void *data, int len, struct ts_config *ts_conf; int flags = 0; - printk("Configuring text: %s from %d:%d to %d:%d len %d\n", conf->algo, conf->from_offset, - conf->from_layer, conf->to_offset, conf->to_layer, conf->pattern_len); - if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len)) return -EINVAL; -- cgit From 452f299da3253f65020143f743c2e207b752547b Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Mon, 18 Jul 2005 13:30:53 -0700 Subject: [PKT_SCHED]: Reduce branch mispredictions in pfifo_fast_dequeue The current call to __qdisc_dequeue_head leads to a branch misprediction for every loop iteration; the fact that the most common priority is 2 makes this even worse. This issue was brought up by Eric Dumazet, but unlike his solution, which was to manually unroll the loop, this approach preserves the possibility of increasing the number of bands at compile time. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 73e218e646ac..8edefd5d095d 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -331,11 +331,10 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc) int prio; struct sk_buff_head *list = qdisc_priv(qdisc); - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++, list++) { - struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list); - if (skb) { + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { + if (!skb_queue_empty(list + prio)) { qdisc->q.qlen--; - return skb; + return __qdisc_dequeue_head(qdisc, list + prio); } } -- cgit From 37da647d994cdac7e0bc8d2a365fbda403939a2b Mon Sep 17 00:00:00 2001 From: Victor Fusco Date: Mon, 18 Jul 2005 13:35:43 -0700 Subject: [NETLINK]: Fix "nocast type" warnings From: Victor Fusco Fix the sparse warning "implicit cast to nocast type" Signed-off-by: Victor Fusco Signed-off-by: Domen Puncer Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 3405fdf41b93..ff774a06c89d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -648,7 +648,8 @@ void netlink_detachskb(struct sock *sk, struct sk_buff *skb) sock_put(sk); } -static inline struct sk_buff *netlink_trim(struct sk_buff *skb, int allocation) +static inline struct sk_buff *netlink_trim(struct sk_buff *skb, + unsigned int __nocast allocation) { int delta; @@ -717,7 +718,7 @@ struct netlink_broadcast_data { int failure; int congested; int delivered; - int allocation; + unsigned int allocation; struct sk_buff *skb, *skb2; }; -- cgit From ee71a29eb5e341fe977c5ad7a43782c29bd9cb9e Mon Sep 17 00:00:00 2001 From: Christophe Lucas Date: Mon, 18 Jul 2005 13:38:07 -0700 Subject: [SCTP]: Audit return code of create_proc_* From: Christophe Lucas Audit return of create_proc_* functions. Signed-off-by: Christophe Lucas Signed-off-by: Domen Puncer Signed-off-by: David S. 
Miller --- net/sctp/objcnt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c index 0781e5d509fd..8ff588f0d76a 100644 --- a/net/sctp/objcnt.c +++ b/net/sctp/objcnt.c @@ -127,8 +127,12 @@ done: /* Initialize the objcount in the proc filesystem. */ void sctp_dbg_objcnt_init(void) { - create_proc_read_entry("sctp_dbg_objcnt", 0, proc_net_sctp, + struct proc_dir_entry *ent; + ent = create_proc_read_entry("sctp_dbg_objcnt", 0, proc_net_sctp, sctp_dbg_objcnt_read, NULL); + if (!ent) + printk(KERN_WARNING + "sctp_dbg_objcnt: Unable to create /proc entry.\n"); } /* Cleanup the objcount entry in the proc filesystem. */ -- cgit From d1ad1ff299dd908d07c5e5f27f88bbdb235eb7a5 Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Mon, 18 Jul 2005 13:44:10 -0700 Subject: [SCTP]: Fix potential null pointer dereference while handling an icmp error Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/input.c | 45 ++++++++++++--------------------------------- net/sctp/ipv6.c | 7 +++---- 2 files changed, 15 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 5e085e041a6e..742be9171b7d 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -351,7 +351,6 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, * */ void sctp_icmp_proto_unreachable(struct sock *sk, - struct sctp_endpoint *ep, struct sctp_association *asoc, struct sctp_transport *t) { @@ -367,7 +366,6 @@ void sctp_icmp_proto_unreachable(struct sock *sk, /* Common lookup code for icmp/icmpv6 error handler. */ struct sock *sctp_err_lookup(int family, struct sk_buff *skb, struct sctphdr *sctphdr, - struct sctp_endpoint **epp, struct sctp_association **app, struct sctp_transport **tpp) { @@ -375,11 +373,10 @@ struct sock *sctp_err_lookup(int family, struct sk_buff *skb, union sctp_addr daddr; struct sctp_af *af; struct sock *sk = NULL; - struct sctp_endpoint *ep = NULL; struct sctp_association *asoc = NULL; struct sctp_transport *transport = NULL; - *app = NULL; *epp = NULL; *tpp = NULL; + *app = NULL; *tpp = NULL; af = sctp_get_af_specific(family); if (unlikely(!af)) { @@ -394,26 +391,15 @@ struct sock *sctp_err_lookup(int family, struct sk_buff *skb, * packet. */ asoc = __sctp_lookup_association(&saddr, &daddr, &transport); - if (!asoc) { - /* If there is no matching association, see if it matches any - * endpoint. This may happen for an ICMP error generated in - * response to an INIT_ACK. - */ - ep = __sctp_rcv_lookup_endpoint(&daddr); - if (!ep) { - return NULL; - } - } + if (!asoc) + return NULL; - if (asoc) { - sk = asoc->base.sk; + sk = asoc->base.sk; - if (ntohl(sctphdr->vtag) != asoc->c.peer_vtag) { - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); - goto out; - } - } else - sk = ep->base.sk; + if (ntohl(sctphdr->vtag) != asoc->c.peer_vtag) { + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + goto out; + } sctp_bh_lock_sock(sk); @@ -423,7 +409,6 @@ struct sock *sctp_err_lookup(int family, struct sk_buff *skb, if (sock_owned_by_user(sk)) NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); - *epp = ep; *app = asoc; *tpp = transport; return sk; @@ -432,21 +417,16 @@ out: sock_put(sk); if (asoc) sctp_association_put(asoc); - if (ep) - sctp_endpoint_put(ep); return NULL; } /* Common cleanup code for icmp/icmpv6 error handler. 
*/ -void sctp_err_finish(struct sock *sk, struct sctp_endpoint *ep, - struct sctp_association *asoc) +void sctp_err_finish(struct sock *sk, struct sctp_association *asoc) { sctp_bh_unlock_sock(sk); sock_put(sk); if (asoc) sctp_association_put(asoc); - if (ep) - sctp_endpoint_put(ep); } /* @@ -471,7 +451,6 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) int type = skb->h.icmph->type; int code = skb->h.icmph->code; struct sock *sk; - struct sctp_endpoint *ep; struct sctp_association *asoc; struct sctp_transport *transport; struct inet_sock *inet; @@ -488,7 +467,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) savesctp = skb->h.raw; skb->nh.iph = iph; skb->h.raw = (char *)sh; - sk = sctp_err_lookup(AF_INET, skb, sh, &ep, &asoc, &transport); + sk = sctp_err_lookup(AF_INET, skb, sh, &asoc, &transport); /* Put back, the original pointers. */ skb->nh.raw = saveip; skb->h.raw = savesctp; @@ -515,7 +494,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) } else { if (ICMP_PROT_UNREACH == code) { - sctp_icmp_proto_unreachable(sk, ep, asoc, + sctp_icmp_proto_unreachable(sk, asoc, transport); goto out_unlock; } @@ -544,7 +523,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) } out_unlock: - sctp_err_finish(sk, ep, asoc); + sctp_err_finish(sk, asoc); } /* diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index c7e42d125b9c..e9b2fd480d61 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -91,7 +91,6 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct ipv6hdr *iph = (struct ipv6hdr *)skb->data; struct sctphdr *sh = (struct sctphdr *)(skb->data + offset); struct sock *sk; - struct sctp_endpoint *ep; struct sctp_association *asoc; struct sctp_transport *transport; struct ipv6_pinfo *np; @@ -105,7 +104,7 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, savesctp = skb->h.raw; skb->nh.ipv6h = iph; skb->h.raw = (char *)sh; - sk = sctp_err_lookup(AF_INET6, skb, sh, &ep, &asoc, &transport); + sk = sctp_err_lookup(AF_INET6, skb, sh, &asoc, &transport); /* Put back, the original pointers. */ skb->nh.raw = saveip; skb->h.raw = savesctp; @@ -124,7 +123,7 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out_unlock; case ICMPV6_PARAMPROB: if (ICMPV6_UNK_NEXTHDR == code) { - sctp_icmp_proto_unreachable(sk, ep, asoc, transport); + sctp_icmp_proto_unreachable(sk, asoc, transport); goto out_unlock; } break; @@ -142,7 +141,7 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } out_unlock: - sctp_err_finish(sk, ep, asoc); + sctp_err_finish(sk, asoc); out: if (likely(idev != NULL)) in6_dev_put(idev); -- cgit From 54208991e15fa00e37a9d172ac0d87191a832165 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 18 Jul 2005 13:45:12 -0700 Subject: [NET]: Kconfig: NETCONSOLE and NETPOLL together Put NETCONSOLE and NETPOLL options together since they are related. This cuts down on the hassle of flipping back and forth between the Networking menu and the Network drivers menu to change their config settings. Tested with menuconfig, gconfig, and xconfig. gconfig has a small problem with this. I think that it's a bug in gconfig and I will take it up with Romain Lievin. Signed-off-by: Randy Dunlap Signed-off-by: David S. 
From 54208991e15fa00e37a9d172ac0d87191a832165 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 18 Jul 2005 13:45:12 -0700 Subject: [NET]: Kconfig: NETCONSOLE and NETPOLL together Put NETCONSOLE and NETPOLL options together since they are related. This cuts down on the hassle of flipping back and forth between the Networking menu and the Network drivers menu to change their config settings. Tested with menuconfig, gconfig, and xconfig. gconfig has a small problem with this. I think that it's a bug in gconfig and I will take it up with Romain Lievin. Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- net/Kconfig | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'net') diff --git a/net/Kconfig b/net/Kconfig index 2684e809a649..40a31ba86d2c 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -209,22 +209,6 @@ endmenu endmenu -config NETPOLL - def_bool NETCONSOLE - -config NETPOLL_RX - bool "Netpoll support for trapping incoming packets" - default n - depends on NETPOLL - -config NETPOLL_TRAP - bool "Netpoll traffic trapping" - default n - depends on NETPOLL - -config NET_POLL_CONTROLLER - def_bool NETPOLL - source "net/ax25/Kconfig" source "net/irda/Kconfig" source "net/bluetooth/Kconfig" -- cgit From 6876f95f201ce2d62367d85dd793c1fee351d7a6 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 18 Jul 2005 13:55:19 -0700 Subject: [IPV4]: fix IP_FIB_HASH kconfig warning This patch fixes the following kconfig warning: net/ipv4/Kconfig:92:warning: defaults for choice values not supported Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index df5386885a90..ecea2bc8229f 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -54,9 +54,9 @@ config IP_ADVANCED_ROUTER choice prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)" depends on IP_ADVANCED_ROUTER - default IP_FIB_HASH + default ASK_IP_FIB_HASH -config IP_FIB_HASH +config ASK_IP_FIB_HASH bool "FIB_HASH" ---help--- Current FIB is very proven and good enough for most users. @@ -82,12 +82,8 @@ config IP_FIB_TRIE endchoice -# If the user does not enable advanced routing, he gets the safe -# default of the fib-hash algorithm. config IP_FIB_HASH - bool - depends on !IP_ADVANCED_ROUTER - default y + def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER config IP_MULTIPLE_TABLES bool "IP: policy routing" -- cgit From 3f1c81ff10187396e6d0fd01df82ab0cec256c96 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 18 Jul 2005 17:10:55 -0700 Subject: [EMATCH]: Kill TCF_META_ID_TCCLASSID reference from meta ematch as well. Signed-off-by: David S. Miller --- net/sched/em_meta.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 53d98f8d3d80..feefcbfd03d0 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -250,11 +250,6 @@ META_COLLECTOR(int_tcverd) { dst->value = skb->tc_verd; } - -META_COLLECTOR(int_tcclassid) -{ - dst->value = skb->tc_classid; -} #endif /************************************************************************** @@ -529,7 +524,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(TCINDEX)] = META_FUNC(int_tcindex), #ifdef CONFIG_NET_CLS_ACT [META_ID(TCVERDICT)] = META_FUNC(int_tcverd), - [META_ID(TCCLASSID)] = META_FUNC(int_tcclassid), #endif #ifdef CONFIG_NET_CLS_ROUTE [META_ID(RTCLASSID)] = META_FUNC(int_rtclassid), -- cgit From 322361b3712af97c19ab730b410f9ae5d9aad4c3 Mon Sep 17 00:00:00 2001 From: Chas Williams Date: Tue, 19 Jul 2005 13:54:44 -0700 Subject: [ATM]: allow bind() on point-to-multipoint svcs (from Martin Whitaker) Signed-off-by: Chas Williams Signed-off-by: David S. Miller --- net/atm/svc.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/atm/svc.c b/net/atm/svc.c index 02f5374a51f2..08e46052a3e4 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -118,10 +118,6 @@ static int svc_bind(struct socket *sock,struct sockaddr *sockaddr, goto out; } vcc = ATM_SD(sock); - if (test_bit(ATM_VF_SESSION, &vcc->flags)) { - error = -EINVAL; - goto out; - } addr = (struct sockaddr_atmsvc *) sockaddr; if (addr->sas_family != AF_ATMSVC) { error = -EAFNOSUPPORT; -- cgit
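The check removed above made bind() fail with EINVAL on any VCC flagged ATM_VF_SESSION, which ruled out binding a local address on point-to-multipoint SVCs. With it gone, user space can bind before setting up the party legs, roughly as in this sketch (linux-atm conventions assumed; the socket type and address fill-in are illustrative and error handling is elided):

	#include <string.h>
	#include <sys/socket.h>
	#include <atm.h>	/* linux-atm user-space header, assumed installed */

	struct sockaddr_atmsvc addr;
	int s = socket(PF_ATMSVC, SOCK_DGRAM, 0);

	memset(&addr, 0, sizeof(addr));
	addr.sas_family = AF_ATMSVC;
	/* ... fill in the local ATM address ... */
	bind(s, (struct sockaddr *)&addr, sizeof(addr));	/* previously -EINVAL on p2mp sessions */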
From 88e9fa8a54cf4d66ee8fc1d855e82de827233d74 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Tue, 19 Jul 2005 13:56:53 -0700 Subject: [ATM]: Trivial spelling fix patch for net/Kconfig Signed-off-by: Jesper Juhl Signed-off-by: Adrian Bunk Signed-off-by: Chas Williams Signed-off-by: David S. Miller --- net/atm/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/atm/Kconfig b/net/atm/Kconfig index bea2426229b1..21ff276b2d80 100644 --- a/net/atm/Kconfig +++ b/net/atm/Kconfig @@ -60,7 +60,7 @@ config ATM_BR2684 tristate "RFC1483/2684 Bridged protocols" depends on ATM && INET help - ATM PVCs can carry ethernet PDUs according to rfc2684 (formerly 1483) + ATM PVCs can carry ethernet PDUs according to RFC2684 (formerly 1483) This device will act like an ethernet from the kernels point of view, with the traffic being carried by ATM PVCs (currently 1 PVC/device). This is sometimes used over DSL lines. If in doubt, say N. @@ -69,6 +69,6 @@ config ATM_BR2684_IPFILTER bool "Per-VC IP filter kludge" depends on ATM_BR2684 help - This is an experimental mechanism for users who need to terminating a + This is an experimental mechanism for users who need to terminate a large number of IP-only vcc's. Do not enable this unless you are sure you know what you are doing. -- cgit From 6aef4fdfeaec5f2c66415f2cafa98a3ff927501f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 19 Jul 2005 13:58:40 -0700 Subject: [NET]: Only build flow.o if CONFIG_XFRM=y Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/Makefile b/net/core/Makefile index 5e0c56b7f607..f5f5e58943e8 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -7,9 +7,10 @@ obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o -obj-y += flow.o dev.o ethtool.o dev_mcast.o dst.o \ +obj-y += dev.o ethtool.o dev_mcast.o dst.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o +obj-$(CONFIG_XFRM) += flow.o obj-$(CONFIG_SYSFS) += net-sysfs.o obj-$(CONFIG_NETFILTER) += netfilter.o obj-$(CONFIG_NET_DIVERT) += dv.o -- cgit From abaacad9bcb3f118cc802f527ab5d7c41b63f83a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 19 Jul 2005 13:59:17 -0700 Subject: [IPV4]: Don't select XFRM for ip_gre Signed-off-by: Patrick McHardy Signed-off-by: David S.
Miller --- net/ipv4/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index ecea2bc8229f..c7f1f9245bc9 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -252,7 +252,6 @@ config NET_IPIP config NET_IPGRE tristate "IP: GRE tunnels over IP" - select XFRM help Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the -- cgit From eb3f8f5e22cdee8138e654963e371ad337830efb Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 19 Jul 2005 14:00:13 -0700 Subject: [NET]: BRIDGE_EBT_ARPREPLY must depend on INET BRIDGE_EBT_ARPREPLY=y and INET=n results in the following compile error: net/built-in.o: In function `ebt_target_reply': ebt_arpreply.c:(.text+0x68fb9): undefined reference to `arp_send' make: *** [.tmp_vmlinux1] Error 1 Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller --- net/bridge/netfilter/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 68ccef507b49..c70b3be23026 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -138,7 +138,7 @@ config BRIDGE_EBT_VLAN # config BRIDGE_EBT_ARPREPLY tristate "ebt: arp reply target support" - depends on BRIDGE_NF_EBTABLES + depends on BRIDGE_NF_EBTABLES && INET help This option adds the arp reply target, which allows automatically sending arp replies to arp requests. -- cgit From c877efb207bf4629cfa97ac13412f7392a873485 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 19 Jul 2005 14:01:51 -0700 Subject: [IPV4]: Fix up lots of little whitespace indentation stuff in fib_trie. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv4/af_inet.c | 2 +- net/ipv4/fib_trie.c | 772 ++++++++++++++++++++++++++-------------------------- 2 files changed, 388 insertions(+), 386 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ef7468376ae6..163ae4068b5f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1157,7 +1157,7 @@ static int __init ipv4_proc_init(void) #ifdef CONFIG_IP_FIB_TRIE if (fib_stat_proc_init()) goto out_fib_stat; - #endif +#endif if (ip_misc_proc_init()) goto out_misc; out: diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 4be234c7d8c3..a701405fab0b 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -90,14 +90,14 @@ typedef unsigned int t_key; #define T_LEAF 1 #define NODE_TYPE_MASK 0x1UL #define NODE_PARENT(_node) \ -((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK)) + ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK)) #define NODE_SET_PARENT(_node, _ptr) \ -((_node)->_parent = (((unsigned long)(_ptr)) | \ + ((_node)->_parent = (((unsigned long)(_ptr)) | \ ((_node)->_parent & NODE_TYPE_MASK))) #define NODE_INIT_PARENT(_node, _type) \ -((_node)->_parent = (_type)) + ((_node)->_parent = (_type)) #define NODE_TYPE(_node) \ -((_node)->_parent & NODE_TYPE_MASK) + ((_node)->_parent & NODE_TYPE_MASK) #define IS_TNODE(n) (!(n->_parent & T_LEAF)) #define IS_LEAF(n) (n->_parent & T_LEAF) @@ -147,7 +147,7 @@ struct trie_stat { unsigned int leaves; unsigned int nullpointers; unsigned int nodesizes[MAX_CHILDS]; -}; +}; struct trie { struct node *trie; @@ -185,9 +185,9 @@ static void trie_bug(char *err) BUG(); } -static inline struct node *tnode_get_child(struct tnode *tn, int i) +static inline struct node *tnode_get_child(struct tnode *tn, int i) { - if (i >= 1<bits) + if (i >= 1<bits) trie_bug("tnode_get_child"); return tn->child[i]; @@ -202,7 +202,7 @@ static inline int tnode_child_length(struct tnode *tn) _________________________________________________________________ | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | ---------------------------------------------------------------- - 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 _________________________________________________________________ | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | @@ -226,25 +226,25 @@ static inline t_key tkey_extract_bits(t_key a, int offset, int bits) static inline int tkey_equals(t_key a, t_key b) { - return a == b; + return a == b; } static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b) { - if (bits == 0 || offset >= KEYLENGTH) - return 1; + if (bits == 0 || offset >= KEYLENGTH) + return 1; bits = bits > KEYLENGTH ? KEYLENGTH : bits; return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; -} +} static inline int tkey_mismatch(t_key a, int offset, t_key b) { t_key diff = a ^ b; int i = offset; - if(!diff) - return 0; - while((diff << i) >> (KEYLENGTH-1) == 0) + if (!diff) + return 0; + while ((diff << i) >> (KEYLENGTH-1) == 0) i++; return i; } @@ -314,6 +314,7 @@ static void fn_free_alias(struct fib_alias *fa) The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into n's child array, and will of course be different for each child. + The rest of the bits, from (n->pos + n->bits) onward, are completely unknown at this point. 
@@ -321,7 +322,7 @@ static void fn_free_alias(struct fib_alias *fa) static void check_tnode(struct tnode *tn) { - if(tn && tn->pos+tn->bits > 32) { + if (tn && tn->pos+tn->bits > 32) { printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits); } } @@ -332,7 +333,7 @@ static int inflate_threshold = 50; static struct leaf *leaf_new(void) { struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); - if(l) { + if (l) { NODE_INIT_PARENT(l, T_LEAF); INIT_HLIST_HEAD(&l->list); } @@ -342,7 +343,7 @@ static struct leaf *leaf_new(void) static struct leaf_info *leaf_info_new(int plen) { struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); - if(li) { + if (li) { li->plen = plen; INIT_LIST_HEAD(&li->falh); } @@ -365,7 +366,7 @@ static struct tnode *tnode_alloc(unsigned int size) return kmalloc(size, GFP_KERNEL); } else { return (struct tnode *) - __get_free_pages(GFP_KERNEL, get_order(size)); + __get_free_pages(GFP_KERNEL, get_order(size)); } } @@ -386,7 +387,7 @@ static struct tnode* tnode_new(t_key key, int pos, int bits) int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); struct tnode *tn = tnode_alloc(sz); - if(tn) { + if (tn) { memset(tn, 0, sz); NODE_INIT_PARENT(tn, T_TNODE); tn->pos = pos; @@ -395,7 +396,8 @@ static struct tnode* tnode_new(t_key key, int pos, int bits) tn->full_children = 0; tn->empty_children = 1< 0) + + if (trie_debug > 0) printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode), (unsigned int) (sizeof(struct node) * 1< 0 ) + if (trie_debug > 0 ) printk("FL %p \n", tn); } - else if(IS_TNODE(tn)) { + else if (IS_TNODE(tn)) { __tnode_free(tn); - if(trie_debug > 0 ) + if (trie_debug > 0 ) printk("FT %p \n", tn); } else { @@ -428,58 +430,58 @@ static void tnode_free(struct tnode *tn) static inline int tnode_full(struct tnode *tn, struct node *n) { - if(n == NULL || IS_LEAF(n)) + if (n == NULL || IS_LEAF(n)) return 0; return ((struct tnode *) n)->pos == tn->pos + tn->bits; } -static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n) +static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n) { tnode_put_child_reorg(tn, i, n, -1); } - /* + /* * Add a child at position i overwriting the old value. * Update the value of full_children and empty_children. 
*/ -static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) +static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) { struct node *chi; int isfull; - if(i >= 1<bits) { + if (i >= 1<bits) { printk("bits=%d, i=%d\n", tn->bits, i); trie_bug("tnode_put_child_reorg bits"); } write_lock_bh(&fib_lock); - chi = tn->child[i]; + chi = tn->child[i]; /* update emptyChildren */ if (n == NULL && chi != NULL) tn->empty_children++; else if (n != NULL && chi == NULL) tn->empty_children--; - + /* update fullChildren */ if (wasfull == -1) wasfull = tnode_full(tn, chi); isfull = tnode_full(tn, n); - if (wasfull && !isfull) + if (wasfull && !isfull) tn->full_children--; - - else if (!wasfull && isfull) + + else if (!wasfull && isfull) tn->full_children++; - if(n) - NODE_SET_PARENT(n, tn); + if (n) + NODE_SET_PARENT(n, tn); tn->child[i] = n; write_unlock_bh(&fib_lock); } -static struct node *resize(struct trie *t, struct tnode *tn) +static struct node *resize(struct trie *t, struct tnode *tn) { int i; int err = 0; @@ -487,8 +489,8 @@ static struct node *resize(struct trie *t, struct tnode *tn) if (!tn) return NULL; - if(trie_debug) - printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n", + if (trie_debug) + printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n", tn, inflate_threshold, halve_threshold); /* No children */ @@ -505,7 +507,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* compress one level */ struct node *n = tn->child[i]; - if(n) + if (n) NODE_INIT_PARENT(n, NODE_TYPE(n)); write_unlock_bh(&fib_lock); @@ -514,72 +516,72 @@ static struct node *resize(struct trie *t, struct tnode *tn) } write_unlock_bh(&fib_lock); } - /* + /* * Double as long as the resulting node has a number of * nonempty nodes that are above the threshold. */ /* - * From "Implementing a dynamic compressed trie" by Stefan Nilsson of - * the Helsinki University of Technology and Matti Tikkanen of Nokia + * From "Implementing a dynamic compressed trie" by Stefan Nilsson of + * the Helsinki University of Technology and Matti Tikkanen of Nokia * Telecommunications, page 6: - * "A node is doubled if the ratio of non-empty children to all + * "A node is doubled if the ratio of non-empty children to all * children in the *doubled* node is at least 'high'." * - * 'high' in this instance is the variable 'inflate_threshold'. It - * is expressed as a percentage, so we multiply it with - * tnode_child_length() and instead of multiplying by 2 (since the - * child array will be doubled by inflate()) and multiplying - * the left-hand side by 100 (to handle the percentage thing) we + * 'high' in this instance is the variable 'inflate_threshold'. It + * is expressed as a percentage, so we multiply it with + * tnode_child_length() and instead of multiplying by 2 (since the + * child array will be doubled by inflate()) and multiplying + * the left-hand side by 100 (to handle the percentage thing) we * multiply the left-hand side by 50. - * - * The left-hand side may look a bit weird: tnode_child_length(tn) - * - tn->empty_children is of course the number of non-null children - * in the current node. tn->full_children is the number of "full" + * + * The left-hand side may look a bit weird: tnode_child_length(tn) + * - tn->empty_children is of course the number of non-null children + * in the current node. tn->full_children is the number of "full" * children, that is non-null tnodes with a skip value of 0. 
- * All of those will be doubled in the resulting inflated tnode, so + * All of those will be doubled in the resulting inflated tnode, so * we just count them one extra time here. - * + * * A clearer way to write this would be: - * + * * to_be_doubled = tn->full_children; - * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - + * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - * tn->full_children; * * new_child_length = tnode_child_length(tn) * 2; * - * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / + * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / * new_child_length; * if (new_fill_factor >= inflate_threshold) - * - * ...and so on, tho it would mess up the while() loop. - * + * + * ...and so on, tho it would mess up the while () loop. + * * anyway, * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= * inflate_threshold - * + * * avoid a division: * 100 * (not_to_be_doubled + 2*to_be_doubled) >= * inflate_threshold * new_child_length - * + * * expand not_to_be_doubled and to_be_doubled, and shorten: - * 100 * (tnode_child_length(tn) - tn->empty_children + + * 100 * (tnode_child_length(tn) - tn->empty_children + * tn->full_children ) >= inflate_threshold * new_child_length - * + * * expand new_child_length: - * 100 * (tnode_child_length(tn) - tn->empty_children + + * 100 * (tnode_child_length(tn) - tn->empty_children + * tn->full_children ) >= * inflate_threshold * tnode_child_length(tn) * 2 - * + * * shorten again: - * 50 * (tn->full_children + tnode_child_length(tn) - - * tn->empty_children ) >= inflate_threshold * + * 50 * (tn->full_children + tnode_child_length(tn) - + * tn->empty_children ) >= inflate_threshold * * tnode_child_length(tn) - * + * */ check_tnode(tn); - + err = 0; while ((tn->full_children > 0 && 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= @@ -587,7 +589,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) tn = inflate(t, tn, &err); - if(err) { + if (err) { #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats.resize_node_skipped++; #endif @@ -609,7 +611,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) tn = halve(t, tn, &err); - if(err) { + if (err) { #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats.resize_node_skipped++; #endif @@ -617,18 +619,18 @@ static struct node *resize(struct trie *t, struct tnode *tn) } } - + /* Only one child remains */ if (tn->empty_children == tnode_child_length(tn) - 1) for (i = 0; i < tnode_child_length(tn); i++) { - + write_lock_bh(&fib_lock); if (tn->child[i] != NULL) { /* compress one level */ struct node *n = tn->child[i]; - if(n) + if (n) NODE_INIT_PARENT(n, NODE_TYPE(n)); write_unlock_bh(&fib_lock); @@ -648,7 +650,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) int olen = tnode_child_length(tn); int i; - if(trie_debug) + if (trie_debug) printk("In inflate\n"); tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); @@ -659,12 +661,12 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) } /* - * Preallocate and store tnodes before the actual work so we - * don't get into an inconsistent state if memory allocation - * fails. In case of failure we return the oldnode and inflate + * Preallocate and store tnodes before the actual work so we + * don't get into an inconsistent state if memory allocation + * fails. In case of failure we return the oldnode and inflate * of tnode is ignored. 
*/ - + for(i = 0; i < olen; i++) { struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); @@ -675,20 +677,20 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) struct tnode *left, *right; t_key m = TKEY_GET_MASK(inode->pos, 1); - + left = tnode_new(inode->key&(~m), inode->pos + 1, inode->bits - 1); - if(!left) { - *err = -ENOMEM; + if (!left) { + *err = -ENOMEM; break; } - + right = tnode_new(inode->key|m, inode->pos + 1, inode->bits - 1); - if(!right) { - *err = -ENOMEM; + if (!right) { + *err = -ENOMEM; break; } @@ -697,32 +699,32 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) } } - if(*err) { + if (*err) { int size = tnode_child_length(tn); int j; - for(j = 0; j < size; j++) - if( tn->child[j]) + for(j = 0; j < size; j++) + if (tn->child[j]) tnode_free((struct tnode *)tn->child[j]); tnode_free(tn); - + *err = -ENOMEM; return oldtnode; } for(i = 0; i < olen; i++) { struct node *node = tnode_get_child(oldtnode, i); - + /* An empty child */ if (node == NULL) continue; /* A leaf or an internal node with skipped bits */ - if(IS_LEAF(node) || ((struct tnode *) node)->pos > + if (IS_LEAF(node) || ((struct tnode *) node)->pos > tn->pos + tn->bits - 1) { - if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, + if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, 1) == 0) put_child(t, tn, 2*i, node); else @@ -745,37 +747,37 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) struct tnode *left, *right; int size, j; - /* We will replace this node 'inode' with two new - * ones, 'left' and 'right', each with half of the - * original children. The two new nodes will have - * a position one bit further down the key and this - * means that the "significant" part of their keys - * (see the discussion near the top of this file) - * will differ by one bit, which will be "0" in - * left's key and "1" in right's key. Since we are - * moving the key position by one step, the bit that - * we are moving away from - the bit at position - * (inode->pos) - is the one that will differ between + /* We will replace this node 'inode' with two new + * ones, 'left' and 'right', each with half of the + * original children. The two new nodes will have + * a position one bit further down the key and this + * means that the "significant" part of their keys + * (see the discussion near the top of this file) + * will differ by one bit, which will be "0" in + * left's key and "1" in right's key. Since we are + * moving the key position by one step, the bit that + * we are moving away from - the bit at position + * (inode->pos) - is the one that will differ between * left and right. So... we synthesize that bit in the * two new keys. - * The mask 'm' below will be a single "one" bit at + * The mask 'm' below will be a single "one" bit at * the position (inode->pos) */ - /* Use the old key, but set the new significant - * bit to zero. + /* Use the old key, but set the new significant + * bit to zero. 
*/ left = (struct tnode *) tnode_get_child(tn, 2*i); put_child(t, tn, 2*i, NULL); - if(!left) + if (!left) BUG(); right = (struct tnode *) tnode_get_child(tn, 2*i+1); put_child(t, tn, 2*i+1, NULL); - if(!right) + if (!right) BUG(); size = tnode_child_length(left); @@ -800,9 +802,9 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) int i; int olen = tnode_child_length(tn); - if(trie_debug) printk("In halve\n"); - - tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); + if (trie_debug) printk("In halve\n"); + + tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); if (!tn) { *err = -ENOMEM; @@ -810,39 +812,39 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) } /* - * Preallocate and store tnodes before the actual work so we - * don't get into an inconsistent state if memory allocation - * fails. In case of failure we return the oldnode and halve + * Preallocate and store tnodes before the actual work so we + * don't get into an inconsistent state if memory allocation + * fails. In case of failure we return the oldnode and halve * of tnode is ignored. */ for(i = 0; i < olen; i += 2) { left = tnode_get_child(oldtnode, i); right = tnode_get_child(oldtnode, i+1); - + /* Two nonempty children */ - if( left && right) { + if (left && right) { struct tnode *newBinNode = tnode_new(left->key, tn->pos + tn->bits, 1); - if(!newBinNode) { - *err = -ENOMEM; + if (!newBinNode) { + *err = -ENOMEM; break; } put_child(t, tn, i/2, (struct node *)newBinNode); } } - if(*err) { + if (*err) { int size = tnode_child_length(tn); int j; - for(j = 0; j < size; j++) - if( tn->child[j]) + for(j = 0; j < size; j++) + if (tn->child[j]) tnode_free((struct tnode *)tn->child[j]); tnode_free(tn); - + *err = -ENOMEM; return oldtnode; } @@ -850,7 +852,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) for(i = 0; i < olen; i += 2) { left = tnode_get_child(oldtnode, i); right = tnode_get_child(oldtnode, i+1); - + /* At least one of the children is empty */ if (left == NULL) { if (right == NULL) /* Both are empty */ @@ -858,14 +860,14 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) put_child(t, tn, i/2, right); } else if (right == NULL) put_child(t, tn, i/2, left); - + /* Two nonempty children */ else { struct tnode *newBinNode = (struct tnode *) tnode_get_child(tn, i/2); put_child(t, tn, i/2, NULL); - if(!newBinNode) + if (!newBinNode) BUG(); put_child(t, newBinNode, 0, left); @@ -879,7 +881,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) static void *trie_init(struct trie *t) { - if(t) { + if (t) { t->size = 0; t->trie = NULL; t->revision = 0; @@ -896,8 +898,7 @@ static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) struct leaf_info *li; hlist_for_each_entry(li, node, head, hlist) { - - if ( li->plen == plen ) + if (li->plen == plen) return li; } return NULL; @@ -905,35 +906,35 @@ static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) static inline struct list_head * get_fa_head(struct leaf *l, int plen) { - struct list_head *fa_head=NULL; + struct list_head *fa_head = NULL; struct leaf_info *li = find_leaf_info(&l->list, plen); - - if(li) + + if (li) fa_head = &li->falh; - + return fa_head; } static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) { - struct leaf_info *li=NULL, *last=NULL; + struct leaf_info *li = NULL, *last = NULL; struct hlist_node *node, *tmp; write_lock_bh(&fib_lock); - - if(hlist_empty(head)) + + 
if (hlist_empty(head)) hlist_add_head(&new->hlist, head); else { hlist_for_each_entry_safe(li, node, tmp, head, hlist) { - - if (new->plen > li->plen) + + if (new->plen > li->plen) break; - + last = li; } - if(last) + if (last) hlist_add_after(&last->hlist, &new->hlist); - else + else hlist_add_before(&new->hlist, &li->hlist); } write_unlock_bh(&fib_lock); @@ -947,14 +948,14 @@ fib_find_node(struct trie *t, u32 key) struct node *n; pos = 0; - n=t->trie; + n = t->trie; while (n != NULL && NODE_TYPE(n) == T_TNODE) { tn = (struct tnode *) n; - + check_tnode(tn); - - if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { + + if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { pos=tn->pos + tn->bits; n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); } @@ -977,23 +978,23 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) t_key cindex, key; struct tnode *tp = NULL; - if(!tn) + if (!tn) BUG(); - + key = tn->key; i = 0; while (tn != NULL && NODE_PARENT(tn) != NULL) { - if( i > 10 ) { + if (i > 10) { printk("Rebalance tn=%p \n", tn); - if(tn) printk("tn->parent=%p \n", NODE_PARENT(tn)); - + if (tn) printk("tn->parent=%p \n", NODE_PARENT(tn)); + printk("Rebalance tp=%p \n", tp); - if(tp) printk("tp->parent=%p \n", NODE_PARENT(tp)); + if (tp) printk("tp->parent=%p \n", NODE_PARENT(tp)); } - if( i > 12 ) BUG(); + if (i > 12) BUG(); i++; tp = NODE_PARENT(tn); @@ -1001,14 +1002,14 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); tn = (struct tnode *) resize (t, (struct tnode *)tn); tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); - - if(!NODE_PARENT(tn)) + + if (!NODE_PARENT(tn)) break; tn = NODE_PARENT(tn); } /* Handle last (top) tnode */ - if (IS_TNODE(tn)) + if (IS_TNODE(tn)) tn = (struct tnode*) resize(t, (struct tnode *)tn); return (struct node*) tn; @@ -1022,42 +1023,42 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) struct node *n; struct leaf *l; int missbit; - struct list_head *fa_head=NULL; + struct list_head *fa_head = NULL; struct leaf_info *li; t_key cindex; pos = 0; - n=t->trie; + n = t->trie; - /* If we point to NULL, stop. Either the tree is empty and we should - * just put a new leaf in if, or we have reached an empty child slot, + /* If we point to NULL, stop. Either the tree is empty and we should + * just put a new leaf in if, or we have reached an empty child slot, * and we should just put our new leaf in that. - * If we point to a T_TNODE, check if it matches our key. Note that - * a T_TNODE might be skipping any number of bits - its 'pos' need + * If we point to a T_TNODE, check if it matches our key. Note that + * a T_TNODE might be skipping any number of bits - its 'pos' need * not be the parent's 'pos'+'bits'! * - * If it does match the current key, get pos/bits from it, extract + * If it does match the current key, get pos/bits from it, extract * the index from our key, push the T_TNODE and walk the tree. * * If it doesn't, we have to replace it with a new T_TNODE. * - * If we point to a T_LEAF, it might or might not have the same key - * as we do. If it does, just change the value, update the T_LEAF's - * value, and return it. + * If we point to a T_LEAF, it might or might not have the same key + * as we do. If it does, just change the value, update the T_LEAF's + * value, and return it. * If it doesn't, we need to replace it with a T_TNODE. 
*/ while (n != NULL && NODE_TYPE(n) == T_TNODE) { tn = (struct tnode *) n; - - check_tnode(tn); - if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { + check_tnode(tn); + + if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { tp = tn; pos=tn->pos + tn->bits; n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); - if(n && NODE_PARENT(n) != tn) { + if (n && NODE_PARENT(n) != tn) { printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); BUG(); } @@ -1069,21 +1070,21 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) /* * n ----> NULL, LEAF or TNODE * - * tp is n's (parent) ----> NULL or TNODE + * tp is n's (parent) ----> NULL or TNODE */ - if(tp && IS_LEAF(tp)) + if (tp && IS_LEAF(tp)) BUG(); /* Case 1: n is a leaf. Compare prefixes */ - if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { + if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { struct leaf *l = ( struct leaf *) n; - + li = leaf_info_new(plen); - - if(! li) { + + if (!li) { *err = -ENOMEM; goto err; } @@ -1095,7 +1096,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) t->size++; l = leaf_new(); - if(! l) { + if (!l) { *err = -ENOMEM; goto err; } @@ -1103,7 +1104,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) l->key = key; li = leaf_info_new(plen); - if(! li) { + if (!li) { tnode_free((struct tnode *) l); *err = -ENOMEM; goto err; @@ -1116,8 +1117,8 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) if (t->trie && n == NULL) { NODE_SET_PARENT(l, tp); - - if (!tp) + + if (!tp) BUG(); else { @@ -1127,8 +1128,8 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) } /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ else { - /* - * Add a new tnode here + /* + * Add a new tnode here * first tnode need some special handling */ @@ -1136,39 +1137,39 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) pos=tp->pos+tp->bits; else pos=0; - if(n) { + if (n) { newpos = tkey_mismatch(key, pos, n->key); tn = tnode_new(n->key, newpos, 1); } else { newpos = 0; - tn = tnode_new(key, newpos, 1); /* First tnode */ + tn = tnode_new(key, newpos, 1); /* First tnode */ } - if(!tn) { + if (!tn) { free_leaf_info(li); tnode_free((struct tnode *) l); *err = -ENOMEM; goto err; - } - + } + NODE_SET_PARENT(tn, tp); missbit=tkey_extract_bits(key, newpos, 1); put_child(t, tn, missbit, (struct node *)l); put_child(t, tn, 1-missbit, n); - if(tp) { + if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); } - else { + else { t->trie = (struct node*) tn; /* First tnode */ tp = tn; } } - if(tp && tp->pos+tp->bits > 32) { - printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", + if (tp && tp->pos+tp->bits > 32) { + printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", tp, tp->pos, tp->bits, key, plen); } /* Rebalance the trie */ @@ -1185,7 +1186,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, { struct trie *t = (struct trie *) tb->tb_data; struct fib_alias *fa, *new_fa; - struct list_head *fa_head=NULL; + struct list_head *fa_head = NULL; struct fib_info *fi; int plen = r->rtm_dst_len; int type = r->rtm_type; @@ -1198,17 +1199,17 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, return -EINVAL; key = 0; - if (rta->rta_dst) + if (rta->rta_dst) memcpy(&key, rta->rta_dst, 4); key = ntohl(key); - if(trie_debug) + if (trie_debug) printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); - mask = ntohl( 
inet_make_mask(plen) ); + mask = ntohl( inet_make_mask(plen) ); - if(key & ~mask) + if (key & ~mask) return -EINVAL; key = key & mask; @@ -1217,9 +1218,9 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, goto err; l = fib_find_node(t, key); - fa = NULL; + fa = NULL; - if(l) { + if (l) { fa_head = get_fa_head(l, plen); fa = fib_find_alias(fa_head, tos, fi->fib_priority); } @@ -1298,16 +1299,16 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, new_fa->fa_scope = r->rtm_scope; new_fa->fa_state = 0; #if 0 - new_fa->dst = NULL; + new_fa->dst = NULL; #endif /* * Insert new entry to the list. */ - if(!fa_head) { + if (!fa_head) { fa_head = fib_insert_node(t, &err, key, plen); err = 0; - if(err) + if (err) goto out_free_new_fa; } @@ -1327,11 +1328,11 @@ out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: fib_release_info(fi); -err:; +err:; return err; } -static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp, +static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp, struct fib_result *res, int *err) { int i; @@ -1339,12 +1340,12 @@ static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *pl struct leaf_info *li; struct hlist_head *hhead = &l->list; struct hlist_node *node; - + hlist_for_each_entry(li, node, hhead, hlist) { i = li->plen; mask = ntohl(inet_make_mask(i)); - if (l->key != (key & mask)) + if (l->key != (key & mask)) continue; if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) { @@ -1376,7 +1377,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result n = t->trie; read_lock(&fib_lock); - if(!n) + if (!n) goto failed; #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -1385,19 +1386,19 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result /* Just a leaf? */ if (IS_LEAF(n)) { - if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) ) + if (check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret)) goto found; goto failed; } pn = (struct tnode *) n; chopped_off = 0; - + while (pn) { pos = pn->pos; bits = pn->bits; - if(!chopped_off) + if (!chopped_off) cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits); n = tnode_get_child(pn, cindex); @@ -1417,33 +1418,33 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result int mp; /* - * It's a tnode, and we can do some extra checks here if we + * It's a tnode, and we can do some extra checks here if we * like, to avoid descending into a dead-end branch. - * This tnode is in the parent's child array at index - * key[p_pos..p_pos+p_bits] but potentially with some bits - * chopped off, so in reality the index may be just a + * This tnode is in the parent's child array at index + * key[p_pos..p_pos+p_bits] but potentially with some bits + * chopped off, so in reality the index may be just a * subprefix, padded with zero at the end. - * We can also take a look at any skipped bits in this - * tnode - everything up to p_pos is supposed to be ok, + * We can also take a look at any skipped bits in this + * tnode - everything up to p_pos is supposed to be ok, * and the non-chopped bits of the index (se previous - * paragraph) are also guaranteed ok, but the rest is + * paragraph) are also guaranteed ok, but the rest is * considered unknown. * * The skipped bits are key[pos+bits..cn->pos]. 
*/ - - /* If current_prefix_length < pos+bits, we are already doing - * actual prefix matching, which means everything from - * pos+(bits-chopped_off) onward must be zero along some - * branch of this subtree - otherwise there is *no* valid + + /* If current_prefix_length < pos+bits, we are already doing + * actual prefix matching, which means everything from + * pos+(bits-chopped_off) onward must be zero along some + * branch of this subtree - otherwise there is *no* valid * prefix present. Here we can only check the skipped - * bits. Remember, since we have already indexed into the - * parent's child array, we know that the bits we chopped of + * bits. Remember, since we have already indexed into the + * parent's child array, we know that the bits we chopped of * *are* zero. */ /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ - + if (current_prefix_length < pos+bits) { if (tkey_extract_bits(cn->key, current_prefix_length, cn->pos - current_prefix_length) != 0 || @@ -1452,13 +1453,13 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result } /* - * If chopped_off=0, the index is fully validated and we - * only need to look at the skipped bits for this, the new, + * If chopped_off=0, the index is fully validated and we + * only need to look at the skipped bits for this, the new, * tnode. What we actually want to do is to find out if * these skipped bits match our key perfectly, or if we will - * have to count on finding a matching prefix further down, - * because if we do, we would like to have some way of - * verifying the existence of such a prefix at this point. + * have to count on finding a matching prefix further down, + * because if we do, we would like to have some way of + * verifying the existence of such a prefix at this point. */ /* The only thing we can do at this point is to verify that @@ -1470,22 +1471,22 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result * new tnode's key. */ - /* Note: We aren't very concerned about the piece of the key - * that precede pn->pos+pn->bits, since these have already been - * checked. The bits after cn->pos aren't checked since these are - * by definition "unknown" at this point. Thus, what we want to - * see is if we are about to enter the "prefix matching" state, - * and in that case verify that the skipped bits that will prevail - * throughout this subtree are zero, as they have to be if we are + /* Note: We aren't very concerned about the piece of the key + * that precede pn->pos+pn->bits, since these have already been + * checked. The bits after cn->pos aren't checked since these are + * by definition "unknown" at this point. Thus, what we want to + * see is if we are about to enter the "prefix matching" state, + * and in that case verify that the skipped bits that will prevail + * throughout this subtree are zero, as they have to be if we are * to find a matching prefix. */ node_prefix = MASK_PFX(cn->key, cn->pos); - key_prefix = MASK_PFX(key, cn->pos); + key_prefix = MASK_PFX(key, cn->pos); pref_mismatch = key_prefix^node_prefix; mp = 0; - /* In short: If skipped bits in this node do not match the search + /* In short: If skipped bits in this node do not match the search * key, enter the "prefix matching" state.directly. 
*/ if (pref_mismatch) { @@ -1494,7 +1495,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result pref_mismatch = pref_mismatch <<1; } key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); - + if (key_prefix != 0) goto backtrace; @@ -1505,9 +1506,9 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result pn = (struct tnode *)n; /* Descend */ chopped_off = 0; continue; - } - if (IS_LEAF(n)) { - if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret)) + } + if (IS_LEAF(n)) { + if (check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret)) goto found; } backtrace: @@ -1521,18 +1522,18 @@ backtrace: /* Decrease current_... with bits chopped off */ if (current_prefix_length > pn->pos + pn->bits - chopped_off) current_prefix_length = pn->pos + pn->bits - chopped_off; - + /* - * Either we do the actual chop off according or if we have + * Either we do the actual chop off according or if we have * chopped off all bits in this tnode walk up to our parent. */ - if(chopped_off <= pn->bits) + if (chopped_off <= pn->bits) cindex &= ~(1 << (chopped_off-1)); else { - if( NODE_PARENT(pn) == NULL) + if (NODE_PARENT(pn) == NULL) goto failed; - + /* Get Child's index */ cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits); pn = NODE_PARENT(pn); @@ -1542,10 +1543,10 @@ backtrace: t->stats.backtrack++; #endif goto backtrace; - } + } } failed: - ret = 1; + ret = 1; found: read_unlock(&fib_lock); return ret; @@ -1558,11 +1559,11 @@ static int trie_leaf_remove(struct trie *t, t_key key) struct node *n = t->trie; struct leaf *l; - if(trie_debug) + if (trie_debug) printk("entering trie_leaf_remove(%p)\n", n); /* Note that in the case skipped bits, those bits are *not* checked! - * When we finish this, we will have NULL or a T_LEAF, and the + * When we finish this, we will have NULL or a T_LEAF, and the * T_LEAF may or may not match our key. */ @@ -1571,19 +1572,19 @@ static int trie_leaf_remove(struct trie *t, t_key key) check_tnode(tn); n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); - if(n && NODE_PARENT(n) != tn) { + if (n && NODE_PARENT(n) != tn) { printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); BUG(); } } l = (struct leaf *) n; - if(!n || !tkey_equals(l->key, key)) + if (!n || !tkey_equals(l->key, key)) return 0; - - /* - * Key found. - * Remove the leaf and rebalance the tree + + /* + * Key found. 
+ * Remove the leaf and rebalance the tree */ t->revision++; @@ -1592,7 +1593,7 @@ static int trie_leaf_remove(struct trie *t, t_key key) tp = NODE_PARENT(n); tnode_free((struct tnode *) n); - if(tp) { + if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, NULL); t->trie = trie_rebalance(t, tp); @@ -1615,23 +1616,23 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, struct list_head *fa_head; struct leaf *l; - if (plen > 32) + if (plen > 32) return -EINVAL; key = 0; - if (rta->rta_dst) + if (rta->rta_dst) memcpy(&key, rta->rta_dst, 4); key = ntohl(key); - mask = ntohl( inet_make_mask(plen) ); + mask = ntohl( inet_make_mask(plen) ); - if(key & ~mask) + if (key & ~mask) return -EINVAL; key = key & mask; l = fib_find_node(t, key); - if(!l) + if (!l) return -ESRCH; fa_head = get_fa_head(l, plen); @@ -1677,16 +1678,16 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, list_del(&fa->fa_list); - if(list_empty(fa_head)) { + if (list_empty(fa_head)) { hlist_del(&li->hlist); kill_li = 1; } write_unlock_bh(&fib_lock); - - if(kill_li) + + if (kill_li) free_leaf_info(li); - if(hlist_empty(&l->list)) + if (hlist_empty(&l->list)) trie_leaf_remove(t, key); if (fa->fa_state & FA_S_ACCESSED) @@ -1705,12 +1706,12 @@ static int trie_flush_list(struct trie *t, struct list_head *head) list_for_each_entry_safe(fa, fa_node, head, fa_list) { struct fib_info *fi = fa->fa_info; - + if (fi && (fi->fib_flags&RTNH_F_DEAD)) { - write_lock_bh(&fib_lock); + write_lock_bh(&fib_lock); list_del(&fa->fa_list); - write_unlock_bh(&fib_lock); + write_unlock_bh(&fib_lock); fn_free_alias(fa); found++; @@ -1727,14 +1728,14 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l) struct leaf_info *li = NULL; hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { - + found += trie_flush_list(t, &li->falh); if (list_empty(&li->falh)) { - write_lock_bh(&fib_lock); + write_lock_bh(&fib_lock); hlist_del(&li->hlist); - write_unlock_bh(&fib_lock); + write_unlock_bh(&fib_lock); free_leaf_info(li); } @@ -1748,8 +1749,8 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) struct tnode *p; int idx; - if(c == NULL) { - if(t->trie == NULL) + if (c == NULL) { + if (t->trie == NULL) return NULL; if (IS_LEAF(t->trie)) /* trie w. just a leaf */ @@ -1757,33 +1758,34 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) p = (struct tnode*) t->trie; /* Start */ } - else + else p = (struct tnode *) NODE_PARENT(c); + while (p) { int pos, last; /* Find the next child of the parent */ - if(c) - pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits); - else + if (c) + pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits); + else pos = 0; last = 1 << p->bits; for(idx = pos; idx < last ; idx++) { - if( p->child[idx]) { + if (p->child[idx]) { /* Decend if tnode */ while (IS_TNODE(p->child[idx])) { p = (struct tnode*) p->child[idx]; idx = 0; - + /* Rightmost non-NULL branch */ - if( p && IS_TNODE(p) ) - while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++; + if (p && IS_TNODE(p)) + while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++; /* Done with this tnode? 
*/ - if( idx >= (1 << p->bits) || p->child[idx] == NULL ) + if (idx >= (1 << p->bits) || p->child[idx] == NULL ) goto up; } return (struct leaf*) p->child[idx]; @@ -1816,7 +1818,7 @@ static int fn_trie_flush(struct fib_table *tb) if (ll && hlist_empty(&ll->list)) trie_leaf_remove(t, ll->key); - if(trie_debug) + if (trie_debug) printk("trie_flush found=%d\n", found); return found; } @@ -1839,32 +1841,32 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib order = -1; read_lock(&fib_lock); - + l = fib_find_node(t, 0); - if(!l) + if (!l) goto out; fa_head = get_fa_head(l, 0); - if(!fa_head) + if (!fa_head) goto out; - if (list_empty(fa_head)) + if (list_empty(fa_head)) goto out; list_for_each_entry(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; - + if (fa->fa_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; - + if (next_fi->fib_priority > res->fi->fib_priority) break; if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) continue; fa->fa_state |= FA_S_ACCESSED; - + if (fi == NULL) { if (next_fi != res->fi) break; @@ -1902,10 +1904,10 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib } trie_last_dflt = last_idx; out:; - read_unlock(&fib_lock); + read_unlock(&fib_lock); } -static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, +static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { int i, s_i; @@ -1951,7 +1953,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi return skb->len; } -static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb, +static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { int h, s_h; @@ -1968,11 +1970,11 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str sizeof(cb->args) - 3*sizeof(cb->args[0])); fa_head = get_fa_head(l, plen); - - if(!fa_head) + + if (!fa_head) continue; - if(list_empty(fa_head)) + if (list_empty(fa_head)) continue; if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { @@ -2048,10 +2050,10 @@ struct fib_table * __init fib_hash_init(int id) trie_init(t); - if (id == RT_TABLE_LOCAL) - trie_local=t; - else if (id == RT_TABLE_MAIN) - trie_main=t; + if (id == RT_TABLE_LOCAL) + trie_local = t; + else if (id == RT_TABLE_MAIN) + trie_main = t; if (id == RT_TABLE_LOCAL) printk("IPv4 FIB: Using LC-trie version %s\n", VERSION); @@ -2072,7 +2074,7 @@ static void printbin_seq(struct seq_file *seq, unsigned int v, int bits) seq_printf(seq, "%s", (v & (1<key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256); else { - int plen=((struct tnode *)n)->pos; + int plen = ((struct tnode *)n)->pos; t_key prf=MASK_PFX(n->key, plen); - seq_printf(seq, "key=%d.%d.%d.%d/%d\n", + seq_printf(seq, "key=%d.%d.%d.%d/%d\n", prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen); } if (IS_LEAF(n)) { @@ -2103,14 +2105,14 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, struct fib_alias *fa; int i; for (i=32; i>=0; i--) - if(find_leaf_info(&l->list, i)) { - + if (find_leaf_info(&l->list, i)) { + struct list_head *fa_head = get_fa_head(l, i); - - if(!fa_head) + + if (!fa_head) continue; - if(list_empty(fa_head)) + if (list_empty(fa_head)) continue; putspace_seq(seq, indent+2); @@ -2136,7 +2138,7 @@ static void 
printnode_seq(struct seq_file *seq, int indent, struct node *n, } } else if (IS_TNODE(n)) { - struct tnode *tn=(struct tnode *)n; + struct tnode *tn = (struct tnode *)n; putspace_seq(seq, indent); seq_printf(seq, "| "); seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos)); printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos); @@ -2152,7 +2154,7 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, static void trie_dump_seq(struct seq_file *seq, struct trie *t) { - struct node *n=t->trie; + struct node *n = t->trie; int cindex=0; int indent=1; int pend=0; @@ -2164,7 +2166,7 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t) if (n) { printnode_seq(seq, indent, n, pend, cindex, 0); if (IS_TNODE(n)) { - struct tnode *tn=(struct tnode *)n; + struct tnode *tn = (struct tnode *)n; pend = tn->pos+tn->bits; putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); indent += 3; @@ -2172,42 +2174,42 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t) while (tn && cindex < (1 << tn->bits)) { if (tn->child[cindex]) { - + /* Got a child */ - + printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits); - if (IS_LEAF(tn->child[cindex])) { + if (IS_LEAF(tn->child[cindex])) { cindex++; - + } else { - /* - * New tnode. Decend one level + /* + * New tnode. Decend one level */ - + depth++; - n=tn->child[cindex]; - tn=(struct tnode *)n; - pend=tn->pos+tn->bits; + n = tn->child[cindex]; + tn = (struct tnode *)n; + pend = tn->pos+tn->bits; putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); indent+=3; cindex=0; } } - else + else cindex++; /* - * Test if we are done + * Test if we are done */ - + while (cindex >= (1 << tn->bits)) { /* * Move upwards and test for root * pop off all traversed nodes */ - + if (NODE_PARENT(tn) == NULL) { tn = NULL; n = NULL; @@ -2217,8 +2219,8 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t) cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); tn = NODE_PARENT(tn); cindex++; - n=(struct node *)tn; - pend=tn->pos+tn->bits; + n = (struct node *)tn; + pend = tn->pos+tn->bits; indent-=3; depth--; } @@ -2236,36 +2238,36 @@ static struct trie_stat *trie_stat_new(void) { struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); int i; - - if(s) { + + if (s) { s->totdepth = 0; s->maxdepth = 0; s->tnodes = 0; s->leaves = 0; s->nullpointers = 0; - + for(i=0; i< MAX_CHILDS; i++) s->nodesizes[i] = 0; } return s; -} +} static struct trie_stat *trie_collect_stats(struct trie *t) { - struct node *n=t->trie; + struct node *n = t->trie; struct trie_stat *s = trie_stat_new(); int cindex = 0; int indent = 1; int pend = 0; int depth = 0; - read_lock(&fib_lock); + read_lock(&fib_lock); if (s) { if (n) { if (IS_TNODE(n)) { struct tnode *tn = (struct tnode *)n; - pend=tn->pos+tn->bits; + pend = tn->pos+tn->bits; indent += 3; s->nodesizes[tn->bits]++; depth++; @@ -2273,26 +2275,26 @@ static struct trie_stat *trie_collect_stats(struct trie *t) while (tn && cindex < (1 << tn->bits)) { if (tn->child[cindex]) { /* Got a child */ - - if (IS_LEAF(tn->child[cindex])) { + + if (IS_LEAF(tn->child[cindex])) { cindex++; - + /* stats */ if (depth > s->maxdepth) s->maxdepth = depth; s->totdepth += depth; s->leaves++; } - + else { - /* - * New tnode. Decend one level + /* + * New tnode. 
Decend one level */ - + s->tnodes++; s->nodesizes[tn->bits]++; depth++; - + n = tn->child[cindex]; tn = (struct tnode *)n; pend = tn->pos+tn->bits; @@ -2303,13 +2305,13 @@ static struct trie_stat *trie_collect_stats(struct trie *t) } else { cindex++; - s->nullpointers++; + s->nullpointers++; } /* - * Test if we are done + * Test if we are done */ - + while (cindex >= (1 << tn->bits)) { /* @@ -2317,7 +2319,7 @@ static struct trie_stat *trie_collect_stats(struct trie *t) * pop off all traversed nodes */ - + if (NODE_PARENT(tn) == NULL) { tn = NULL; n = NULL; @@ -2326,9 +2328,9 @@ static struct trie_stat *trie_collect_stats(struct trie *t) else { cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); tn = NODE_PARENT(tn); - cindex++; + cindex++; n = (struct node *)tn; - pend=tn->pos+tn->bits; + pend = tn->pos+tn->bits; indent -= 3; depth--; } @@ -2339,7 +2341,7 @@ static struct trie_stat *trie_collect_stats(struct trie *t) } } - read_unlock(&fib_lock); + read_unlock(&fib_lock); return s; } @@ -2375,7 +2377,7 @@ static void fib_triestat_seq_stop(struct seq_file *seq, void *v) } -/* +/* * This outputs /proc/net/fib_triestats * * It always works in backward compatibility mode. @@ -2401,7 +2403,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq) avdepth=0; seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 ); seq_printf(seq, "Max depth: %4d\n", stat->maxdepth); - + seq_printf(seq, "Leaves: %d\n", stat->leaves); bytes += sizeof(struct leaf) * stat->leaves; seq_printf(seq, "Internal nodes: %d\n", stat->tnodes); @@ -2413,7 +2415,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq) max--; pointers = 0; - for (i = 1; i <= max; i++) + for (i = 1; i <= max; i++) if (stat->nodesizes[i] != 0) { seq_printf(seq, " %d: %d", i, stat->nodesizes[i]); pointers += (1<nodesizes[i]; @@ -2444,30 +2446,30 @@ static void collect_and_show(struct trie *t, struct seq_file *seq) static int fib_triestat_seq_show(struct seq_file *seq, void *v) { char bf[128]; - + if (v == SEQ_START_TOKEN) { - seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n", + seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n", sizeof(struct leaf), sizeof(struct tnode)); - if (trie_local) + if (trie_local) collect_and_show(trie_local, seq); - if (trie_main) + if (trie_main) collect_and_show(trie_main, seq); } else { snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400); - + seq_printf(seq, "%-127s\n", bf); } return 0; } static struct seq_operations fib_triestat_seq_ops = { - .start = fib_triestat_seq_start, - .next = fib_triestat_seq_next, - .stop = fib_triestat_seq_stop, - .show = fib_triestat_seq_show, + .start = fib_triestat_seq_start, + .next = fib_triestat_seq_next, + .stop = fib_triestat_seq_stop, + .show = fib_triestat_seq_show, }; static int fib_triestat_seq_open(struct inode *inode, struct file *file) @@ -2479,7 +2481,7 @@ static int fib_triestat_seq_open(struct inode *inode, struct file *file) if (rc) goto out_kfree; - seq = file->private_data; + seq = file->private_data; out: return rc; out_kfree: @@ -2487,11 +2489,11 @@ out_kfree: } static struct file_operations fib_triestat_seq_fops = { - .owner = THIS_MODULE, - .open = fib_triestat_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, + .owner = THIS_MODULE, + .open = fib_triestat_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, }; int __init fib_stat_proc_init(void) @@ -2536,7 
+2538,7 @@ static void fib_trie_seq_stop(struct seq_file *seq, void *v) } -/* +/* * This outputs /proc/net/fib_trie. * * It always works in backward compatibility mode. @@ -2548,10 +2550,10 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) char bf[128]; if (v == SEQ_START_TOKEN) { - if (trie_local) + if (trie_local) trie_dump_seq(seq, trie_local); - if (trie_main) + if (trie_main) trie_dump_seq(seq, trie_main); } @@ -2565,10 +2567,10 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) } static struct seq_operations fib_trie_seq_ops = { - .start = fib_trie_seq_start, - .next = fib_trie_seq_next, - .stop = fib_trie_seq_stop, - .show = fib_trie_seq_show, + .start = fib_trie_seq_start, + .next = fib_trie_seq_next, + .stop = fib_trie_seq_stop, + .show = fib_trie_seq_show, }; static int fib_trie_seq_open(struct inode *inode, struct file *file) @@ -2580,7 +2582,7 @@ static int fib_trie_seq_open(struct inode *inode, struct file *file) if (rc) goto out_kfree; - seq = file->private_data; + seq = file->private_data; out: return rc; out_kfree: @@ -2588,11 +2590,11 @@ out_kfree: } static struct file_operations fib_trie_seq_fops = { - .owner = THIS_MODULE, - .open = fib_trie_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, + .owner = THIS_MODULE, + .open = fib_trie_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release= seq_release_private, }; int __init fib_proc_init(void) -- cgit From 0303770deb834c15ca664a9d741d40f893c92f4e Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 19 Jul 2005 14:03:34 -0700 Subject: [NET]: Make ipip/ip6_tunnel independent of XFRM Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 1 - net/ipv4/ipip.c | 36 ++++++++++++++++++++++++++++++++---- net/ipv4/xfrm4_tunnel.c | 3 +-- net/ipv6/Kconfig | 1 - net/ipv6/ip6_tunnel.c | 38 +++++++++++++++++++++++++++++++++----- 5 files changed, 66 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index c7f1f9245bc9..fc561c0ae8e2 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -235,7 +235,6 @@ config IP_PNP_RARP # bool ' IP: ARP support' CONFIG_IP_PNP_ARP config NET_IPIP tristate "IP: tunneling" - select INET_TUNNEL ---help--- Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 68a78731f722..c3947cd566b7 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -273,7 +273,7 @@ static void ipip_tunnel_uninit(struct net_device *dev) dev_put(dev); } -static void ipip_err(struct sk_buff *skb, void *__unused) +static void ipip_err(struct sk_buff *skb, u32 info) { #ifndef I_WISH_WORLD_WERE_PERFECT @@ -852,11 +852,39 @@ static int __init ipip_fb_tunnel_init(struct net_device *dev) return 0; } +#ifdef CONFIG_INET_TUNNEL static struct xfrm_tunnel ipip_handler = { .handler = ipip_rcv, .err_handler = ipip_err, }; +static inline int ipip_register(void) +{ + return xfrm4_tunnel_register(&ipip_handler); +} + +static inline int ipip_unregister(void) +{ + return xfrm4_tunnel_deregister(&ipip_handler); +} +#else +static struct net_protocol ipip_protocol = { + .handler = ipip_rcv, + .err_handler = ipip_err, + .no_policy = 1, +}; + +static inline int ipip_register(void) +{ + return inet_add_protocol(&ipip_protocol, IPPROTO_IPIP); +} + +static inline int ipip_unregister(void) +{ + return inet_del_protocol(&ipip_protocol, IPPROTO_IPIP); +} +#endif + static char banner[]
__initdata = KERN_INFO "IPv4 over IPv4 tunneling driver\n"; @@ -866,7 +894,7 @@ static int __init ipip_init(void) printk(banner); - if (xfrm4_tunnel_register(&ipip_handler) < 0) { + if (ipip_register() < 0) { printk(KERN_INFO "ipip init: can't register tunnel\n"); return -EAGAIN; } @@ -888,13 +916,13 @@ static int __init ipip_init(void) err2: free_netdev(ipip_fb_tunnel_dev); err1: - xfrm4_tunnel_deregister(&ipip_handler); + ipip_unregister(); goto out; } static void __exit ipip_fini(void) { - if (xfrm4_tunnel_deregister(&ipip_handler) < 0) + if (ipip_unregister() < 0) printk(KERN_INFO "ipip close: can't deregister tunnel\n"); unregister_netdev(ipip_fb_tunnel_dev); diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index e1fe360ed27a..afbb0d4cc305 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -78,10 +78,9 @@ static int ipip_rcv(struct sk_buff *skb) static void ipip_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler = ipip_handler; - u32 arg = info; if (handler) - handler->err_handler(skb, &arg); + handler->err_handler(skb, info); } static int ipip_init_state(struct xfrm_state *x) diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 95163cd52ae0..ab7a9124f985 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -91,7 +91,6 @@ config INET6_TUNNEL config IPV6_TUNNEL tristate "IPv6: IPv6-in-IPv6 tunnel" depends on IPV6 - select INET6_TUNNEL ---help--- Support for IPv6-in-IPv6 tunnels described in RFC 2473. diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index ba3b0c267f75..f39ddeae1eef 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1110,11 +1110,39 @@ ip6ip6_fb_tnl_dev_init(struct net_device *dev) return 0; } +#ifdef CONFIG_INET6_TUNNEL static struct xfrm6_tunnel ip6ip6_handler = { - .handler = ip6ip6_rcv, - .err_handler = ip6ip6_err, + .handler = ip6ip6_rcv, + .err_handler = ip6ip6_err, }; +static inline int ip6ip6_register(void) +{ + return xfrm6_tunnel_register(&ip6ip6_handler); +} + +static inline int ip6ip6_unregister(void) +{ + return xfrm6_tunnel_unregister(&ip6ip6_handler); +} +#else +static struct inet6_protocol xfrm6_tunnel_protocol = { + .handler = ip6ip6_rcv, + .err_handler = ip6ip6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +static inline int ip6ip6_register(void) +{ + return inet6_add_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6); +} + +static inline int ip6ip6_unregister(void) +{ + return inet6_del_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6); +} +#endif + /** * ip6_tunnel_init - register protocol and reserve needed resources * @@ -1125,7 +1153,7 @@ static int __init ip6_tunnel_init(void) { int err; - if (xfrm6_tunnel_register(&ip6ip6_handler) < 0) { + if (ip6ip6_register() < 0) { printk(KERN_ERR "ip6ip6 init: can't register tunnel\n"); return -EAGAIN; } @@ -1144,7 +1172,7 @@ static int __init ip6_tunnel_init(void) } return 0; fail: - xfrm6_tunnel_deregister(&ip6ip6_handler); + ip6ip6_unregister(); return err; } @@ -1154,7 +1182,7 @@ fail: static void __exit ip6_tunnel_cleanup(void) { - if (xfrm6_tunnel_deregister(&ip6ip6_handler) < 0) + if (ip6ip6_unregister() < 0) printk(KERN_INFO "ip6ip6 close: can't deregister tunnel\n"); unregister_netdev(ip6ip6_fb_tnl_dev); -- cgit From b72f6eccb0fc516070f2de469f73870cb6cb1149 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 19 Jul 2005 14:13:54 -0700 Subject: [NET]: Fix tc_verd thinko in skb_clone() It was overwriting the computed n->tc_verd value over and over with skb->tc_verd, by mistake. Signed-off-by: David S.
Miller --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d9f7b06fe886..7eab867ede59 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -377,8 +377,8 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) C(tc_index); #ifdef CONFIG_NET_CLS_ACT n->tc_verd = SET_TC_VERD(skb->tc_verd,0); - n->tc_verd = CLR_TC_OK2MUNGE(skb->tc_verd); - n->tc_verd = CLR_TC_MUNGED(skb->tc_verd); + n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd); + n->tc_verd = CLR_TC_MUNGED(n->tc_verd); C(input_dev); C(tc_classid); #endif -- cgit From 4acdbdbe5089c06d5e0c7e96783fcc4414ded00a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 21 Jul 2005 13:14:46 -0700 Subject: [NETFILTER]: ip_conntrack_expect_related must not free expectation If a connection tracking helper tells us to expect a connection, and we're already expecting that connection, we simply free the one they gave us and return success. The problem is that NAT helpers (eg. FTP) have to allocate the expectation first (to see what port is available) then rewrite the packet. If that rewrite fails, they try to remove the expectation, but it was freed in ip_conntrack_expect_related. This is one example of a larger problem: having registered the expectation, the pointer is no longer ours to use. Reference counting is needed for ctnetlink anyway, so introduce it now. To have a single "put" path, we need to grab the reference to the connection on creation, rather than open-coding it in the caller. Signed-off-by: Rusty Russell Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_amanda.c | 8 +++--- net/ipv4/netfilter/ip_conntrack_core.c | 40 +++++++++++++--------------- net/ipv4/netfilter/ip_conntrack_ftp.c | 14 +++++----- net/ipv4/netfilter/ip_conntrack_irc.c | 8 +++--- net/ipv4/netfilter/ip_conntrack_standalone.c | 2 +- net/ipv4/netfilter/ip_conntrack_tftp.c | 8 +++--- net/ipv4/netfilter/ip_nat_amanda.c | 4 +-- net/ipv4/netfilter/ip_nat_ftp.c | 4 +-- net/ipv4/netfilter/ip_nat_irc.c | 4 +-- net/ipv4/netfilter/ip_nat_tftp.c | 4 +-- 10 files changed, 39 insertions(+), 57 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index a78a320eee08..01e1b58322a9 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -101,14 +101,13 @@ static int help(struct sk_buff **pskb, if (port == 0 || len > 5) break; - exp = ip_conntrack_expect_alloc(); + exp = ip_conntrack_expect_alloc(ct); if (exp == NULL) { ret = NF_DROP; goto out; } exp->expectfn = NULL; - exp->master = ct; exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; exp->tuple.src.u.tcp.port = 0; @@ -126,10 +125,9 @@ static int help(struct sk_buff **pskb, ret = ip_nat_amanda_hook(pskb, ctinfo, tmp - amanda_buffer, len, exp); - else if (ip_conntrack_expect_related(exp) != 0) { - ip_conntrack_expect_free(exp); + else if (ip_conntrack_expect_related(exp) != 0) ret = NF_DROP; - } + ip_conntrack_expect_put(exp); } out: diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 4b78ebeb6635..14af55cad5d6 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -137,19 +137,12 @@ ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse, /* ip_conntrack_expect helper functions */ -static void destroy_expect(struct ip_conntrack_expect *exp) -{ - 
ip_conntrack_put(exp->master); - IP_NF_ASSERT(!timer_pending(&exp->timeout)); - kmem_cache_free(ip_conntrack_expect_cachep, exp); - CONNTRACK_STAT_INC(expect_delete); -} - static void unlink_expect(struct ip_conntrack_expect *exp) { ASSERT_WRITE_LOCK(&ip_conntrack_lock); + IP_NF_ASSERT(!timer_pending(&exp->timeout)); list_del(&exp->list); - /* Logically in destroy_expect, but we hold the lock here. */ + CONNTRACK_STAT_INC(expect_delete); exp->master->expecting--; } @@ -160,7 +153,7 @@ static void expectation_timed_out(unsigned long ul_expect) write_lock_bh(&ip_conntrack_lock); unlink_expect(exp); write_unlock_bh(&ip_conntrack_lock); - destroy_expect(exp); + ip_conntrack_expect_put(exp); } /* If an expectation for this connection is found, it gets delete from @@ -198,7 +191,7 @@ static void remove_expectations(struct ip_conntrack *ct) list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) { if (i->master == ct && del_timer(&i->timeout)) { unlink_expect(i); - destroy_expect(i); + ip_conntrack_expect_put(i); } } } @@ -537,7 +530,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, if (exp) { if (exp->expectfn) exp->expectfn(conntrack, exp); - destroy_expect(exp); + ip_conntrack_expect_put(exp); } return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; @@ -729,14 +722,14 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp) if (expect_matches(i, exp) && del_timer(&i->timeout)) { unlink_expect(i); write_unlock_bh(&ip_conntrack_lock); - destroy_expect(i); + ip_conntrack_expect_put(i); return; } } write_unlock_bh(&ip_conntrack_lock); } -struct ip_conntrack_expect *ip_conntrack_expect_alloc(void) +struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me) { struct ip_conntrack_expect *new; @@ -745,18 +738,23 @@ struct ip_conntrack_expect *ip_conntrack_expect_alloc(void) DEBUGP("expect_related: OOM allocating expect\n"); return NULL; } - new->master = NULL; + new->master = me; + atomic_inc(&new->master->ct_general.use); + atomic_set(&new->use, 1); return new; } -void ip_conntrack_expect_free(struct ip_conntrack_expect *expect) +void ip_conntrack_expect_put(struct ip_conntrack_expect *exp) { - kmem_cache_free(ip_conntrack_expect_cachep, expect); + if (atomic_dec_and_test(&exp->use)) { + ip_conntrack_put(exp->master); + kmem_cache_free(ip_conntrack_expect_cachep, exp); + } } static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) { - atomic_inc(&exp->master->ct_general.use); + atomic_inc(&exp->use); exp->master->expecting++; list_add(&exp->list, &ip_conntrack_expect_list); @@ -778,7 +776,7 @@ static void evict_oldest_expect(struct ip_conntrack *master) if (i->master == master) { if (del_timer(&i->timeout)) { unlink_expect(i); - destroy_expect(i); + ip_conntrack_expect_put(i); } break; } @@ -810,8 +808,6 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) /* Refresh timer: if it's dying, ignore.. */ if (refresh_timer(i)) { ret = 0; - /* We don't need the one they've given us. */ - ip_conntrack_expect_free(expect); goto out; } } else if (expect_clash(i, expect)) { @@ -881,7 +877,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { if (exp->master->helper == me && del_timer(&exp->timeout)) { unlink_expect(exp); - destroy_expect(exp); + ip_conntrack_expect_put(exp); } } /* Get rid of expecteds, set helpers to NULL. 
*/ diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index fea6dd2a00b6..7a3b773be3f9 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -376,7 +376,7 @@ static int help(struct sk_buff **pskb, fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff); /* Allocate expectation which will be inserted */ - exp = ip_conntrack_expect_alloc(); + exp = ip_conntrack_expect_alloc(ct); if (exp == NULL) { ret = NF_DROP; goto out; @@ -403,8 +403,7 @@ static int help(struct sk_buff **pskb, networks, or the packet filter itself). */ if (!loose) { ret = NF_ACCEPT; - ip_conntrack_expect_free(exp); - goto out_update_nl; + goto out_put_expect; } exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3]); @@ -419,7 +418,6 @@ static int help(struct sk_buff **pskb, { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }}); exp->expectfn = NULL; - exp->master = ct; /* Now, NAT might want to mangle the packet, and register the * (possibly changed) expectation itself. */ @@ -428,13 +426,15 @@ static int help(struct sk_buff **pskb, matchoff, matchlen, exp, &seq); else { /* Can't expect this? Best to drop packet now. */ - if (ip_conntrack_expect_related(exp) != 0) { - ip_conntrack_expect_free(exp); + if (ip_conntrack_expect_related(exp) != 0) ret = NF_DROP; - } else + else ret = NF_ACCEPT; } +out_put_expect: + ip_conntrack_expect_put(exp); + out_update_nl: /* Now if this ends in \n, update ftp info. Seq may have been * adjusted by NAT code. */ diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index cd98772cc332..4a28f297d502 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -197,7 +197,7 @@ static int help(struct sk_buff **pskb, continue; } - exp = ip_conntrack_expect_alloc(); + exp = ip_conntrack_expect_alloc(ct); if (exp == NULL) { ret = NF_DROP; goto out; @@ -221,16 +221,14 @@ static int help(struct sk_buff **pskb, { { 0, { 0 } }, { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }}); exp->expectfn = NULL; - exp->master = ct; if (ip_nat_irc_hook) ret = ip_nat_irc_hook(pskb, ctinfo, addr_beg_p - ib_ptr, addr_end_p - addr_beg_p, exp); - else if (ip_conntrack_expect_related(exp) != 0) { - ip_conntrack_expect_free(exp); + else if (ip_conntrack_expect_related(exp) != 0) ret = NF_DROP; - } + ip_conntrack_expect_put(exp); goto out; } /* for .. NUM_DCCPROTO */ } /* while data < ... 
*/ diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 1dd824f3cf0a..61798c46e91d 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -985,7 +985,7 @@ EXPORT_SYMBOL(ip_ct_refresh_acct); EXPORT_SYMBOL(ip_ct_protos); EXPORT_SYMBOL(ip_ct_find_proto); EXPORT_SYMBOL(ip_conntrack_expect_alloc); -EXPORT_SYMBOL(ip_conntrack_expect_free); +EXPORT_SYMBOL(ip_conntrack_expect_put); EXPORT_SYMBOL(ip_conntrack_expect_related); EXPORT_SYMBOL(ip_conntrack_unexpect_related); EXPORT_SYMBOL(ip_conntrack_tuple_taken); diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c index 992fac3e36ee..f8ff170f390a 100644 --- a/net/ipv4/netfilter/ip_conntrack_tftp.c +++ b/net/ipv4/netfilter/ip_conntrack_tftp.c @@ -65,7 +65,7 @@ static int tftp_help(struct sk_buff **pskb, DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); - exp = ip_conntrack_expect_alloc(); + exp = ip_conntrack_expect_alloc(ct); if (exp == NULL) return NF_DROP; @@ -75,17 +75,15 @@ static int tftp_help(struct sk_buff **pskb, exp->mask.dst.u.udp.port = 0xffff; exp->mask.dst.protonum = 0xff; exp->expectfn = NULL; - exp->master = ct; DEBUGP("expect: "); DUMP_TUPLE(&exp->tuple); DUMP_TUPLE(&exp->mask); if (ip_nat_tftp_hook) ret = ip_nat_tftp_hook(pskb, ctinfo, exp); - else if (ip_conntrack_expect_related(exp) != 0) { - ip_conntrack_expect_free(exp); + else if (ip_conntrack_expect_related(exp) != 0) ret = NF_DROP; - } + ip_conntrack_expect_put(exp); break; case TFTP_OPCODE_DATA: case TFTP_OPCODE_ACK: diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c index da1f412583ed..706c8074f422 100644 --- a/net/ipv4/netfilter/ip_nat_amanda.c +++ b/net/ipv4/netfilter/ip_nat_amanda.c @@ -56,10 +56,8 @@ static unsigned int help(struct sk_buff **pskb, break; } - if (port == 0) { - ip_conntrack_expect_free(exp); + if (port == 0) return NF_DROP; - } sprintf(buffer, "%u", port); ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo, diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c index c6000e794ad6..d83757a70d9f 100644 --- a/net/ipv4/netfilter/ip_nat_ftp.c +++ b/net/ipv4/netfilter/ip_nat_ftp.c @@ -143,10 +143,8 @@ static unsigned int ip_nat_ftp(struct sk_buff **pskb, break; } - if (port == 0) { - ip_conntrack_expect_free(exp); + if (port == 0) return NF_DROP; - } if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo, seq)) { diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c index 9c1ca3381d56..de31942babe3 100644 --- a/net/ipv4/netfilter/ip_nat_irc.c +++ b/net/ipv4/netfilter/ip_nat_irc.c @@ -65,10 +65,8 @@ static unsigned int help(struct sk_buff **pskb, break; } - if (port == 0) { - ip_conntrack_expect_free(exp); + if (port == 0) return NF_DROP; - } /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27 * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28 diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c index 0343e0d64674..2215317c76b7 100644 --- a/net/ipv4/netfilter/ip_nat_tftp.c +++ b/net/ipv4/netfilter/ip_nat_tftp.c @@ -45,10 +45,8 @@ static unsigned int help(struct sk_buff **pskb, exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port; exp->dir = IP_CT_DIR_REPLY; exp->expectfn = ip_nat_follow_master; - if (ip_conntrack_expect_related(exp) != 0) { - ip_conntrack_expect_free(exp); + if (ip_conntrack_expect_related(exp) != 0) return NF_DROP; - 
} return NF_ACCEPT; } -- cgit From 28e212fb360ce2568edd60b93d60683d5ad24146 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 22 Jul 2005 11:47:25 -0700 Subject: [PKT_SCHED]: Kill TCF_META_ID_REALDEV from meta ematch. It won't exist any longer when we shrink the SKB in 2.6.14, and we should kill this off before anyone in userspace starts using it. Signed-off-by: David S. Miller Acked-by: Thomas Graf --- net/sched/em_meta.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'net') diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index feefcbfd03d0..179efb5bc9b3 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -180,16 +180,6 @@ META_COLLECTOR(var_indev) *err = var_dev(skb->input_dev, dst); } -META_COLLECTOR(int_realdev) -{ - *err = int_dev(skb->real_dev, dst); -} - -META_COLLECTOR(var_realdev) -{ - *err = var_dev(skb->real_dev, dst); -} - /************************************************************************** * skb attributes **************************************************************************/ @@ -501,7 +491,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [TCF_META_TYPE_VAR] = { [META_ID(DEV)] = META_FUNC(var_dev), [META_ID(INDEV)] = META_FUNC(var_indev), - [META_ID(REALDEV)] = META_FUNC(var_realdev), [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if), }, [TCF_META_TYPE_INT] = { @@ -511,7 +500,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(LOADAVG_2)] = META_FUNC(int_loadavg_2), [META_ID(DEV)] = META_FUNC(int_dev), [META_ID(INDEV)] = META_FUNC(int_indev), - [META_ID(REALDEV)] = META_FUNC(int_realdev), [META_ID(PRIORITY)] = META_FUNC(int_priority), [META_ID(PROTOCOL)] = META_FUNC(int_protocol), [META_ID(PKTTYPE)] = META_FUNC(int_pkttype), -- cgit From 4c1217deeb148ff8ab838ba4f1875d0f52dea343 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 22 Jul 2005 12:49:30 -0700 Subject: [NETFILTER]: Fix deadlock in ip6_queue Already fixed in ip_queue, ip6_queue was missed. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv6/netfilter/ip6_queue.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index 750943e2d34e..5493180f0d44 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -76,7 +76,9 @@ static DECLARE_MUTEX(ipqnl_sem); static void ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) { + local_bh_disable(); nf_reinject(entry->skb, entry->info, verdict); + local_bh_enable(); kfree(entry); } -- cgit From d04b4f8c1c9766e49fad6a141fc61cb30db69a5c Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 22 Jul 2005 12:50:29 -0700 Subject: [NETFILTER]: Fix potential memory corruption in NAT code (aka memory NAT) The portptr pointing to the port in the conntrack tuple is declared static, which could result in memory corruption when two packets of the same protocol are NATed at the same time and one conntrack goes away. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ip_nat_proto_tcp.c | 3 ++- net/ipv4/netfilter/ip_nat_proto_udp.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c index a91cfceff272..a98e36d2b3c6 100644 --- a/net/ipv4/netfilter/ip_nat_proto_tcp.c +++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c @@ -40,7 +40,8 @@ tcp_unique_tuple(struct ip_conntrack_tuple *tuple, enum ip_nat_manip_type maniptype, const struct ip_conntrack *conntrack) { - static u_int16_t port, *portptr; + static u_int16_t port; + u_int16_t *portptr; unsigned int range_size, min, i; if (maniptype == IP_NAT_MANIP_SRC) diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c index c669e3b5f5d0..9f66e5625664 100644 --- a/net/ipv4/netfilter/ip_nat_proto_udp.c +++ b/net/ipv4/netfilter/ip_nat_proto_udp.c @@ -41,7 +41,8 @@ udp_unique_tuple(struct ip_conntrack_tuple *tuple, enum ip_nat_manip_type maniptype, const struct ip_conntrack *conntrack) { - static u_int16_t port, *portptr; + static u_int16_t port; + u_int16_t *portptr; unsigned int range_size, min, i; if (maniptype == IP_NAT_MANIP_SRC) -- cgit From 21f930e4abdcb9649f26e5b959c14dddee4e600b Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 22 Jul 2005 12:51:03 -0700 Subject: [NETFILTER]: Wait until all references to ip_conntrack_untracked are dropped on unload Fixes a crash when unloading ip_conntrack. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 14af55cad5d6..63bf88264980 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -1107,6 +1107,9 @@ void ip_conntrack_cleanup(void) schedule(); goto i_see_dead_people; } + /* wait until all references to ip_conntrack_untracked are dropped */ + while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) + schedule(); kmem_cache_destroy(ip_conntrack_cachep); kmem_cache_destroy(ip_conntrack_expect_cachep); -- cgit From 74bb421da7f39e70ab636ad46ef85ea1178786c5 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 22 Jul 2005 12:51:38 -0700 Subject: [NETFILTER]: Use correct byteorder in ICMP NAT Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_nat_proto_icmp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c index a558cf0eee8a..6596c9ee1655 100644 --- a/net/ipv4/netfilter/ip_nat_proto_icmp.c +++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c @@ -35,16 +35,17 @@ icmp_unique_tuple(struct ip_conntrack_tuple *tuple, const struct ip_conntrack *conntrack) { static u_int16_t id; - unsigned int range_size - = (unsigned int)range->max.icmp.id - range->min.icmp.id + 1; + unsigned int range_size; unsigned int i; + range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1; /* If no range specified... 
*/ if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) range_size = 0xFFFF; for (i = 0; i < range_size; i++, id++) { - tuple->src.u.icmp.id = range->min.icmp.id + (id % range_size); + tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) + + (id % range_size)); if (!ip_nat_used_tuple(tuple, conntrack)) return 1; } -- cgit From d3984a6b6abac6203868f0e9095c0ed9e33ece03 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 22 Jul 2005 12:52:47 -0700 Subject: [NETFILTER]: Fix ip6t_LOG MAC format I broke this in the patch that consolidated MAC logging. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv6/netfilter/ip6t_LOG.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index c44685e391b7..a692e26a4fa3 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -373,9 +373,10 @@ ip6t_log_packet(unsigned int hooknum, in ? in->name : "", out ? out->name : ""); if (in && !out) { + unsigned int len; /* MAC logging for input chain only. */ printk("MAC="); - if (skb->dev && skb->dev->hard_header_len && + if (skb->dev && (len = skb->dev->hard_header_len) && skb->mac.raw != skb->nh.raw) { unsigned char *p = skb->mac.raw; int i; @@ -384,9 +385,11 @@ ip6t_log_packet(unsigned int hooknum, (p -= ETH_HLEN) < skb->head) p = NULL; - if (p != NULL) - for (i = 0; i < skb->dev->hard_header_len; i++) - printk("%02x", p[i]); + if (p != NULL) { + for (i = 0; i < len; i++) + printk("%02x%s", p[i], + i == len - 1 ? "" : ":"); + } printk(" "); if (skb->dev->type == ARPHRD_SIT) { -- cgit From 261688d01ec07d3a265b8ace6ec68310fbd96a96 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 22 Jul 2005 14:43:52 -0700 Subject: [PKT_SCHED]: em_meta: Kill TCF_META_ID_{INDEV,SECURITY,TCVERDICT} More unusable TCF_META_* match types that need to get eliminated before 2.6.13 goes out the door. Signed-off-by: David S. Miller Acked-by: Thomas Graf --- net/sched/em_meta.c | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 179efb5bc9b3..a18b924743d9 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -27,17 +27,17 @@ * lvalue rvalue * +-----------+ +-----------+ * | type: INT | | type: INT | - * def | id: INDEV | | id: VALUE | + * def | id: DEV | | id: VALUE | * | data: | | data: 3 | * +-----------+ +-----------+ * | | - * ---> meta_ops[INT][INDEV](...) | + * ---> meta_ops[INT][DEV](...) 
| * | | * ----------- | * V V * +-----------+ +-----------+ * | type: INT | | type: INT | - * obj | id: INDEV | | id: VALUE | + * obj | id: DEV | | id: VALUE | * | data: 2 |<--data got filled out | data: 3 | * +-----------+ +-----------+ * | | @@ -170,16 +170,6 @@ META_COLLECTOR(var_dev) *err = var_dev(skb->dev, dst); } -META_COLLECTOR(int_indev) -{ - *err = int_dev(skb->input_dev, dst); -} - -META_COLLECTOR(var_indev) -{ - *err = var_dev(skb->input_dev, dst); -} - /************************************************************************** * skb attributes **************************************************************************/ @@ -235,13 +225,6 @@ META_COLLECTOR(int_tcindex) dst->value = skb->tc_index; } -#ifdef CONFIG_NET_CLS_ACT -META_COLLECTOR(int_tcverd) -{ - dst->value = skb->tc_verd; -} -#endif - /************************************************************************** * Routing **************************************************************************/ @@ -490,7 +473,6 @@ struct meta_ops static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [TCF_META_TYPE_VAR] = { [META_ID(DEV)] = META_FUNC(var_dev), - [META_ID(INDEV)] = META_FUNC(var_indev), [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if), }, [TCF_META_TYPE_INT] = { @@ -499,7 +481,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(LOADAVG_1)] = META_FUNC(int_loadavg_1), [META_ID(LOADAVG_2)] = META_FUNC(int_loadavg_2), [META_ID(DEV)] = META_FUNC(int_dev), - [META_ID(INDEV)] = META_FUNC(int_indev), [META_ID(PRIORITY)] = META_FUNC(int_priority), [META_ID(PROTOCOL)] = META_FUNC(int_protocol), [META_ID(PKTTYPE)] = META_FUNC(int_pkttype), @@ -510,9 +491,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(NFMARK)] = META_FUNC(int_nfmark), #endif [META_ID(TCINDEX)] = META_FUNC(int_tcindex), -#ifdef CONFIG_NET_CLS_ACT - [META_ID(TCVERDICT)] = META_FUNC(int_tcverd), -#endif #ifdef CONFIG_NET_CLS_ROUTE [META_ID(RTCLASSID)] = META_FUNC(int_rtclassid), #endif -- cgit From 227510c7f175c44b12cdff6eab316e53dbf71f92 Mon Sep 17 00:00:00 2001 From: Cal Peake Date: Sun, 24 Jul 2005 19:30:06 -0700 Subject: [IPV6]: fix implicit declaration of function `xfrm6_tunnel_unregister' Signed-off-by: Cal Peake Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index f39ddeae1eef..09613729404c 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1123,7 +1123,7 @@ static inline int ip6ip6_register(void) static inline int ip6ip6_unregister(void) { - return xfrm6_tunnel_unregister(&ip6ip6_handler); + return xfrm6_tunnel_deregister(&ip6ip6_handler); } #else static struct inet6_protocol xfrm6_tunnel_protocol = { -- cgit From 7686ee1ad976efeddf10583f013462c66408ae51 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 24 Jul 2005 19:44:23 -0700 Subject: [EMATCH]: Remove feature ifdefs in meta ematch. Signed-off-by: Patrick McHardy Acked-by: Thomas Graf Signed-off-by: David S. 
Miller --- net/sched/em_meta.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index a18b924743d9..00eae5f9a01a 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -209,12 +209,14 @@ META_COLLECTOR(int_maclen) * Netfilter **************************************************************************/ -#ifdef CONFIG_NETFILTER META_COLLECTOR(int_nfmark) { +#ifdef CONFIG_NETFILTER dst->value = skb->nfmark; -} +#else + dst->value = 0; #endif +} /************************************************************************** * Traffic Control @@ -229,15 +231,17 @@ META_COLLECTOR(int_tcindex) * Routing **************************************************************************/ -#ifdef CONFIG_NET_CLS_ROUTE META_COLLECTOR(int_rtclassid) { if (unlikely(skb->dst == NULL)) *err = -1; else +#ifdef CONFIG_NET_CLS_ROUTE dst->value = skb->dst->tclassid; -} +#else + dst->value = 0; #endif +} META_COLLECTOR(int_rtiif) { @@ -487,13 +491,9 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(PKTLEN)] = META_FUNC(int_pktlen), [META_ID(DATALEN)] = META_FUNC(int_datalen), [META_ID(MACLEN)] = META_FUNC(int_maclen), -#ifdef CONFIG_NETFILTER [META_ID(NFMARK)] = META_FUNC(int_nfmark), -#endif [META_ID(TCINDEX)] = META_FUNC(int_tcindex), -#ifdef CONFIG_NET_CLS_ROUTE [META_ID(RTCLASSID)] = META_FUNC(int_rtclassid), -#endif [META_ID(RTIIF)] = META_FUNC(int_rtiif), [META_ID(SK_FAMILY)] = META_FUNC(int_sk_family), [META_ID(SK_STATE)] = META_FUNC(int_sk_state), -- cgit From a4f1bac62564049ea4718c4624b0fadc9f597c84 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 Jul 2005 15:43:17 -0700 Subject: [XFRM]: Fix possible overflow of sock->sk_policy Spotted by, and original patch by, Balazs Scheidler. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index ecade4893a13..8da3e25b2c4c 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1350,6 +1350,9 @@ static struct xfrm_policy *xfrm_compile_policy(u16 family, int opt, if (nr > XFRM_MAX_DEPTH) return NULL; + if (p->dir > XFRM_POLICY_OUT) + return NULL; + xp = xfrm_policy_alloc(GFP_KERNEL); if (xp == NULL) { *dir = -ENOBUFS; -- cgit From eaa1c5d05947819643b2e72cbfc51ae2ddcf1991 Mon Sep 17 00:00:00 2001 From: "Hans-Juergen Tappe (SYSGO AG)" Date: Wed, 27 Jul 2005 13:00:04 -0700 Subject: [IPV4]: Fix Kconfig syntax error From: "Hans-Juergen Tappe (SYSGO AG)" Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index fc561c0ae8e2..0b3d9f1d8069 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -124,7 +124,7 @@ config IP_ROUTE_MULTIPATH config IP_ROUTE_MULTIPATH_CACHED bool "IP: equal cost multipath with caching support (EXPERIMENTAL)" - depends on: IP_ROUTE_MULTIPATH + depends on IP_ROUTE_MULTIPATH help Normally, equal cost multipath routing is not supported by the routing cache. If you say Y here, alternative routes are cached -- cgit From a77be819f94fc55627ee257f496198ad703aaad4 Mon Sep 17 00:00:00 2001 From: Kyle Moffett Date: Wed, 27 Jul 2005 14:22:30 -0700 Subject: [NET]: Fix setsockopt locking bug On Sparc, SO_DONTLINGER support resulted in sock_reset_flag being called without lock_sock(). 
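To make the one-liner below easier to follow: the invariant being restored is that socket flag updates happen under the socket lock. In sketch form, with a hypothetical helper name that is not part of the patch:

static void example_clear_linger(struct sock *sk)
{
        lock_sock(sk);                          /* serialize against other socket state changes */
        sock_reset_flag(sk, SOCK_LINGER);       /* non-atomic flag update, safe only under the lock */
        release_sock(sk);                       /* drops the lock and runs the socket backlog */
}

Without lock_sock(), a concurrent setsockopt() on the same socket can interleave with the flag update and lose other flag changes.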
Signed-off-by: Kyle Moffett Signed-off-by: David S. Miller --- net/core/sock.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 8b35ccdc2b3b..12f6d9a2a522 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -206,13 +206,14 @@ int sock_setsockopt(struct socket *sock, int level, int optname, */ #ifdef SO_DONTLINGER /* Compatibility item... */ - switch (optname) { - case SO_DONTLINGER: - sock_reset_flag(sk, SOCK_LINGER); - return 0; + if (optname == SO_DONTLINGER) { + lock_sock(sk); + sock_reset_flag(sk, SOCK_LINGER); + release_sock(sk); + return 0; } -#endif - +#endif + if(optlen Date: Wed, 27 Jul 2005 14:46:03 -0700 Subject: [NETFILTER]: Fix -Wundef error in ip_conntrack_core.c Signed-off-by: Nick Sillik Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 63bf88264980..86f04e41dd8e 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -510,7 +510,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &conntrack->status); conntrack->master = exp->master; -#if CONFIG_IP_NF_CONNTRACK_MARK +#ifdef CONFIG_IP_NF_CONNTRACK_MARK conntrack->mark = exp->master->mark; #endif nf_conntrack_get(&conntrack->master->ct_general); -- cgit From 5e43db7730e7cef7d37968ea789c41392519a864 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Wed, 27 Jul 2005 15:24:42 -0700 Subject: [NET]: Move in_aton from net/ipv4/utils.c to net/core/utils.c Move in_aton to allow netpoll and pktgen to work without the rest of the IPv4 stack. Fix whitespace and add comment for the odd placement. Delete now-empty net/ipv4/utils.c Re-enable netpoll/netconsole without CONFIG_INET Signed-off-by: Matt Mackall Signed-off-by: David S. Miller --- net/core/utils.c | 37 +++++++++++++++++++++++++++++++++- net/ipv4/Makefile | 2 +- net/ipv4/utils.c | 59 ------------------------------------------------------- 3 files changed, 37 insertions(+), 61 deletions(-) delete mode 100644 net/ipv4/utils.c (limited to 'net') diff --git a/net/core/utils.c b/net/core/utils.c index e11a8654f363..88eb8b68e26b 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -23,10 +23,10 @@ #include #include +#include #include #include - /* This is a maximally equidistributed combined Tausworthe generator based on code from GNU Scientific Library 1.5 (30 Jun 2004) @@ -153,3 +153,38 @@ int net_ratelimit(void) EXPORT_SYMBOL(net_random); EXPORT_SYMBOL(net_ratelimit); EXPORT_SYMBOL(net_srandom); + +/* + * Convert an ASCII string to binary IP. + * This is outside of net/ipv4/ because various code that uses IP addresses + * is otherwise not dependent on the TCP/IP stack. + */ + +__u32 in_aton(const char *str) +{ + unsigned long l; + unsigned int val; + int i; + + l = 0; + for (i = 0; i < 4; i++) + { + l <<= 8; + if (*str != '\0') + { + val = 0; + while (*str != '\0' && *str != '.') + { + val *= 10; + val += *str - '0'; + str++; + } + l |= val; + if (*str != '\0') + str++; + } + } + return(htonl(l)); +} + +EXPORT_SYMBOL(in_aton); diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 5718cdb3a61e..55dc6cca1e7b 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -2,7 +2,7 @@ # Makefile for the Linux TCP/IP (INET) layer.
# -obj-y := utils.o route.o inetpeer.o protocol.o \ +obj-y := route.o inetpeer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ ip_output.o ip_sockglue.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c deleted file mode 100644 index 6aecd7a43534..000000000000 --- a/net/ipv4/utils.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * Various kernel-resident INET utility functions; mainly - * for format conversion and debugging output. - * - * Version: $Id: utils.c,v 1.8 2000/10/03 07:29:01 anton Exp $ - * - * Author: Fred N. van Kempen, - * - * Fixes: - * Alan Cox : verify_area check. - * Alan Cox : removed old debugging. - * Andi Kleen : add net_ratelimit() - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include - -/* - * Convert an ASCII string to binary IP. - */ - -__u32 in_aton(const char *str) -{ - unsigned long l; - unsigned int val; - int i; - - l = 0; - for (i = 0; i < 4; i++) - { - l <<= 8; - if (*str != '\0') - { - val = 0; - while (*str != '\0' && *str != '.') - { - val *= 10; - val += *str - '0'; - str++; - } - l |= val; - if (*str != '\0') - str++; - } - } - return(htonl(l)); -} - -EXPORT_SYMBOL(in_aton); -- cgit From 44456d37b59d8e541936ed26d8b6e08d27e88ac1 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Wed, 27 Jul 2005 11:45:17 -0700 Subject: [PATCH] turn many #if $undefined_string into #ifdef $undefined_string turn many #if $undefined_string into #ifdef $undefined_string to fix some warnings after -Wundef was added to global CFLAGS Signed-off-by: Olaf Hering Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/ipv6/ip6_output.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1f2c2f9e353f..ae652ca14bc9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -792,13 +792,8 @@ int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl) if (ipv6_addr_any(&fl->fl6_src)) { err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src); - if (err) { -#if IP6_DEBUG >= 2 - printk(KERN_DEBUG "ip6_dst_lookup: " - "no available source address\n"); -#endif + if (err) goto out_err_release; - } } return 0; -- cgit From 77933d7276ee8fa0e2947641941a6f7a100a327b Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Wed, 27 Jul 2005 11:46:09 -0700 Subject: [PATCH] clean up inline static vs static inline `gcc -W' likes to complain if the static keyword is not at the beginning of the declaration. This patch replaces all remaining occurrences of "inline static" with "static inline" in the entire kernel tree (140 occurrences in 47 files). While making this change I came across a few lines with trailing whitespace that I also fixed up, I have also added or removed a blank line or two here and there, but there are no functional changes in the patch.
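In sketch form, the style rule being enforced here, using toy functions that are not from the tree:

/* `gcc -W' warns because "static" is not the first keyword: */
inline static int toy_increment_bad(int x) { return x + 1; }

/* the preferred spelling compiles without a warning: */
static inline int toy_increment_good(int x) { return x + 1; }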
Signed-off-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 975d651312dc..8eb083b6041a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -363,7 +363,7 @@ struct pktgen_thread { * All Rights Reserved. * */ -inline static s64 divremdi3(s64 x, s64 y, int type) +static inline s64 divremdi3(s64 x, s64 y, int type) { u64 a = (x < 0) ? -x : x; u64 b = (y < 0) ? -y : y; -- cgit From 6192b54b845ed05cb838f86ca588cc625c703a09 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 28 Jul 2005 12:12:58 -0700 Subject: [NET]: Fix busy waiting in dev_close(). If the current task has signal_pending(), the loop we use to wait for the __LINK_STATE_RX_SCHED bit to clear becomes a pure busy-loop. Fixed by using msleep() instead of the hand-crafted version. Noticed by Andrew Morton. Signed-off-by: David S. Miller --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index ff9dc029233a..52a3bf7ae177 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -901,8 +901,7 @@ int dev_close(struct net_device *dev) smp_mb__after_clear_bit(); /* Commit netif_running(). */ while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) { /* No hurry. */ - current->state = TASK_INTERRUPTIBLE; - schedule_timeout(1); + msleep(1); } /* -- cgit From d1b04c081e3fb0a08ac108737e4efa9f4830c916 Mon Sep 17 00:00:00 2001 From: Baruch Even Date: Sat, 30 Jul 2005 17:41:59 -0700 Subject: [NET]: Spelling mistakes threshoulds -> thresholds Just simple spelling mistake fixes. Signed-Off-By: Baruch Even Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 7833d920bdba..dc806b578427 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -362,7 +362,7 @@ out: /* Fill oifs list. It is called under write locked mrt_lock. */ -static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls) +static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls) { int vifi; @@ -727,7 +727,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) if (c != NULL) { write_lock_bh(&mrt_lock); c->mfc_parent = mfc->mfcc_parent; - ipmr_update_threshoulds(c, mfc->mfcc_ttls); + ipmr_update_thresholds(c, mfc->mfcc_ttls); if (!mrtsock) c->mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); @@ -744,7 +744,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) c->mfc_origin=mfc->mfcc_origin.s_addr; c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr; c->mfc_parent=mfc->mfcc_parent; - ipmr_update_threshoulds(c, mfc->mfcc_ttls); + ipmr_update_thresholds(c, mfc->mfcc_ttls); if (!mrtsock) c->mfc_flags |= MFC_STATIC; -- cgit From 1f494c0e040b001cf844280910d04ba7ebdc2898 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Sat, 30 Jul 2005 17:44:07 -0700 Subject: [NETFILTER] Inherit masq_index to slave connections masq_index is used for cleanup in case the interface address changes (such as a dialup ppp link with dynamic addresses). Without this patch, slave connections are not evicted in such a case, since they don't inherit masq_index. Signed-off-by: Harald Welte Signed-off-by: David S.
Miller --- net/ipv4/netfilter/ip_conntrack_core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 86f04e41dd8e..a7f0c821a9b2 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -512,6 +512,11 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, conntrack->master = exp->master; #ifdef CONFIG_IP_NF_CONNTRACK_MARK conntrack->mark = exp->master->mark; +#endif +#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ + defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) + /* this is ugly, but there is no other place where to put it */ + conntrack->nat.masq_index = exp->master->nat.masq_index; #endif nf_conntrack_get(&conntrack->master->ct_general); CONNTRACK_STAT_INC(expect_new); -- cgit From db44575f6fd55df6ff67ddd21f7ad5be5a741136 Mon Sep 17 00:00:00 2001 From: Alexey Kuznetsov Date: Sat, 30 Jul 2005 17:46:44 -0700 Subject: [NET]: fix oops after tunnel module unload Tunnel modules used to obtain a module refcount each time a tunnel was created, which meant that a tunnel module could be unloaded only after all of its tunnels were deleted. Since the killing of the old MOD_*_USE_COUNT macros this protection has gone. It is possible to bring it back as module_get/put, but it looks more natural and practically useful to force destruction of all the child tunnels on module unload. Signed-off-by: Alexey Kuznetsov Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 21 ++++++++++++++++++--- net/ipv4/ipip.c | 20 ++++++++++++++++++-- net/ipv6/sit.c | 21 +++++++++++++++++++-- 3 files changed, 55 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 884835522224..f0d5740d7e22 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -290,7 +290,6 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int dev_hold(dev); ipgre_tunnel_link(nt); - /* Do not decrement MOD_USE_COUNT here. */ return nt; failed: @@ -1277,12 +1276,28 @@ err1: goto out; } -static void ipgre_fini(void) +static void __exit ipgre_destroy_tunnels(void) +{ + int prio; + + for (prio = 0; prio < 4; prio++) { + int h; + for (h = 0; h < HASH_SIZE; h++) { + struct ip_tunnel *t; + while ((t = tunnels[prio][h]) != NULL) + unregister_netdevice(t->dev); + } + } +} + +static void __exit ipgre_fini(void) { if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) printk(KERN_INFO "ipgre close: can't remove protocol\n"); - unregister_netdev(ipgre_fb_tunnel_dev); + rtnl_lock(); + ipgre_destroy_tunnels(); + rtnl_unlock(); } module_init(ipgre_init); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index c3947cd566b7..c05c1df0bb04 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -255,7 +255,6 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c dev_hold(dev); ipip_tunnel_link(nt); - /* Do not decrement MOD_USE_COUNT here.
*/ return nt; failed: @@ -920,12 +919,29 @@ static int __init ipip_init(void) goto out; } +static void __exit ipip_destroy_tunnels(void) +{ + int prio; + + for (prio = 1; prio < 4; prio++) { + int h; + for (h = 0; h < HASH_SIZE; h++) { + struct ip_tunnel *t; + while ((t = tunnels[prio][h]) != NULL) + unregister_netdevice(t->dev); + } + } +} + static void __exit ipip_fini(void) { if (ipip_unregister() < 0) printk(KERN_INFO "ipip close: can't deregister tunnel\n"); - unregister_netdev(ipip_fb_tunnel_dev); + rtnl_lock(); + ipip_destroy_tunnels(); + unregister_netdevice(ipip_fb_tunnel_dev); + rtnl_unlock(); } module_init(ipip_init); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index b788f55e139b..e553e5b80d6e 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -195,7 +195,6 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int dev_hold(dev); ipip6_tunnel_link(nt); - /* Do not decrement MOD_USE_COUNT here. */ return nt; failed: @@ -794,10 +793,28 @@ static struct net_protocol sit_protocol = { .err_handler = ipip6_err, }; +static void __exit sit_destroy_tunnels(void) +{ + int prio; + + for (prio = 1; prio < 4; prio++) { + int h; + for (h = 0; h < HASH_SIZE; h++) { + struct ip_tunnel *t; + while ((t = tunnels[prio][h]) != NULL) + unregister_netdevice(t->dev); + } + } +} + void __exit sit_cleanup(void) { inet_del_protocol(&sit_protocol, IPPROTO_IPV6); - unregister_netdev(ipip6_fb_tunnel_dev); + + rtnl_lock(); + sit_destroy_tunnels(); + unregister_netdevice(ipip6_fb_tunnel_dev); + rtnl_unlock(); } int __init sit_init(void) -- cgit From f0098f7863f814a5adc0b9cb271605d063cad7fa Mon Sep 17 00:00:00 2001 From: Denis Lunev Date: Sat, 30 Jul 2005 17:47:25 -0700 Subject: [NET] Fix too aggressive backoff in dst garbage collection The bug is evident once it is seen: the dst gc timer was backed off when the gc queue was not empty. But this means that the timer quickly backs off if at least one destination remains in use. Normally, the bug is invisible, because adding a new dst entry to the queue cancels the backoff. But it hits fatally on destination cache overflow, when new destinations are not released for a long time, e.g. after an interface goes down. The fix is to cancel the backoff when something was released. Signed-off-by: Denis Lunev Signed-off-by: Alexey Kuznetsov Signed-off-by: David S.
Miller --- net/core/dst.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dst.c b/net/core/dst.c index fc434ade5270..334790da9f16 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -45,6 +45,7 @@ static struct timer_list dst_gc_timer = static void dst_run_gc(unsigned long dummy) { int delayed = 0; + int work_performed; struct dst_entry * dst, **dstp; if (!spin_trylock(&dst_lock)) { @@ -52,9 +53,9 @@ static void dst_run_gc(unsigned long dummy) return; } - del_timer(&dst_gc_timer); dstp = &dst_garbage_list; + work_performed = 0; while ((dst = *dstp) != NULL) { if (atomic_read(&dst->__refcnt)) { dstp = &dst->next; @@ -62,6 +63,7 @@ static void dst_run_gc(unsigned long dummy) continue; } *dstp = dst->next; + work_performed = 1; dst = dst_destroy(dst); if (dst) { @@ -86,9 +88,14 @@ static void dst_run_gc(unsigned long dummy) dst_gc_timer_inc = DST_GC_MAX; goto out; } - if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX) - dst_gc_timer_expires = DST_GC_MAX; - dst_gc_timer_inc += DST_GC_INC; + if (!work_performed) { + if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX) + dst_gc_timer_expires = DST_GC_MAX; + dst_gc_timer_inc += DST_GC_INC; + } else { + dst_gc_timer_inc = DST_GC_INC; + dst_gc_timer_expires = DST_GC_MIN; + } dst_gc_timer.expires = jiffies + dst_gc_timer_expires; #if RT_CACHE_DEBUG >= 2 printk("dst_total: %d/%d %ld\n", -- cgit From 846998ae87a80b0fd45b4cf5cf001a159d746f27 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 4 Aug 2005 19:52:01 -0700 Subject: [PATCH] tcp: fix TSO sizing bugs MSS changes can be lost since we preemptively initialize the tso_segs count for an SKB before we 100% commit to sending it out. So, by the time we send it out, the tso_size information can be stale due to PMTU events. This mucks up all of the logic in our send engine, and can even result in the BUG() triggering in tcp_tso_should_defer(). Another problem we have is that we're storing the tp->mss_cache, not the SACK block normalized MSS, as the tso_size. That's wrong too. Signed-off-by: David S. Miller Cc: Herbert Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/ipv4/tcp_output.c | 56 +++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e3f8ea1bfa9c..e118b4b5b326 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -403,11 +403,9 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) sk->sk_send_head = skb; } -static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) +static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { - struct tcp_sock *tp = tcp_sk(sk); - - if (skb->len <= tp->mss_cache || + if (skb->len <= mss_now || !(sk->sk_route_caps & NETIF_F_TSO)) { /* Avoid the costly divide in the normal * non-TSO case. */ @@ -417,10 +415,10 @@ } else { unsigned int factor; - factor = skb->len + (tp->mss_cache - 1); - factor /= tp->mss_cache; + factor = skb->len + (mss_now - 1); + factor /= mss_now; skb_shinfo(skb)->tso_segs = factor; - skb_shinfo(skb)->tso_size = tp->mss_cache; + skb_shinfo(skb)->tso_size = mss_now; } } @@ -429,7 +427,7 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) * packet to the list. This won't be called frequently, I hope.
* Remember, these are still headerless SKBs at this point. */ -static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) +static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; @@ -492,8 +490,8 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) } /* Fix up tso_factor for both original and new SKB. */ - tcp_set_skb_tso_segs(sk, skb); - tcp_set_skb_tso_segs(sk, buff); + tcp_set_skb_tso_segs(sk, skb, mss_now); + tcp_set_skb_tso_segs(sk, buff, mss_now); if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { tp->lost_out += tcp_skb_pcount(skb); @@ -569,7 +567,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) * factor and mss. */ if (tcp_skb_pcount(skb) > 1) - tcp_set_skb_tso_segs(sk, skb); + tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1)); return 0; } @@ -734,12 +732,14 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk /* This must be invoked the first time we consider transmitting * SKB onto the wire. */ -static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb) +static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); - if (!tso_segs) { - tcp_set_skb_tso_segs(sk, skb); + if (!tso_segs || + (tso_segs > 1 && + skb_shinfo(skb)->tso_size != mss_now)) { + tcp_set_skb_tso_segs(sk, skb, mss_now); tso_segs = tcp_skb_pcount(skb); } return tso_segs; @@ -817,7 +817,7 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); unsigned int cwnd_quota; - tcp_init_tso_segs(sk, skb); + tcp_init_tso_segs(sk, skb, cur_mss); if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) return 0; @@ -854,7 +854,7 @@ int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). */ -static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len) +static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now) { struct sk_buff *buff; int nlen = skb->len - len; @@ -887,8 +887,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len) skb_split(skb, buff, len); /* Fix up tso_factor for both original and new SKB. */ - tcp_set_skb_tso_segs(sk, skb); - tcp_set_skb_tso_segs(sk, buff); + tcp_set_skb_tso_segs(sk, skb, mss_now); + tcp_set_skb_tso_segs(sk, buff, mss_now); /* Link BUFF into the send queue. 
*/ skb_header_release(buff); @@ -976,7 +976,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) if (unlikely(!skb)) return 0; - tso_segs = tcp_init_tso_segs(sk, skb); + tso_segs = tcp_init_tso_segs(sk, skb, mss_now); cwnd_quota = tcp_cwnd_test(tp, skb); if (unlikely(!cwnd_quota)) goto out; @@ -1006,11 +1006,11 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) limit = skb->len - trim; } if (skb->len > limit) { - if (tso_fragment(sk, skb, limit)) + if (tso_fragment(sk, skb, limit, mss_now)) break; } } else if (unlikely(skb->len > mss_now)) { - if (unlikely(tcp_fragment(sk, skb, mss_now))) + if (unlikely(tcp_fragment(sk, skb, mss_now, mss_now))) break; } @@ -1039,7 +1039,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) skb = sk->sk_send_head; if (!skb) break; - tso_segs = tcp_init_tso_segs(sk, skb); + tso_segs = tcp_init_tso_segs(sk, skb, mss_now); } if (likely(sent_pkts)) { @@ -1076,7 +1076,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) BUG_ON(!skb || skb->len < mss_now); - tso_segs = tcp_init_tso_segs(sk, skb); + tso_segs = tcp_init_tso_segs(sk, skb, mss_now); cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH); if (likely(cwnd_quota)) { @@ -1093,11 +1093,11 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) limit = skb->len - trim; } if (skb->len > limit) { - if (unlikely(tso_fragment(sk, skb, limit))) + if (unlikely(tso_fragment(sk, skb, limit, mss_now))) return; } } else if (unlikely(skb->len > mss_now)) { - if (unlikely(tcp_fragment(sk, skb, mss_now))) + if (unlikely(tcp_fragment(sk, skb, mss_now, mss_now))) return; } @@ -1388,7 +1388,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) int old_factor = tcp_skb_pcount(skb); int new_factor; - if (tcp_fragment(sk, skb, cur_mss)) + if (tcp_fragment(sk, skb, cur_mss, cur_mss)) return -ENOMEM; /* We'll try again later. */ /* New SKB created, account for it. */ @@ -1991,7 +1991,7 @@ int tcp_write_wakeup(struct sock *sk) skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; - if (tcp_fragment(sk, skb, seg_size)) + if (tcp_fragment(sk, skb, seg_size, mss)) return -1; /* SWS override triggered forced fragmentation. * Disable TSO, the connection is too sick. */ @@ -2000,7 +2000,7 @@ int tcp_write_wakeup(struct sock *sk) sk->sk_route_caps &= ~NETIF_F_TSO; } } else if (!tcp_skb_pcount(skb)) - tcp_set_skb_tso_segs(sk, skb); + tcp_set_skb_tso_segs(sk, skb, mss); TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; -- cgit From b68e9f857271189bd7a59b74c99890de9195b0e1 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 4 Aug 2005 19:52:02 -0700 Subject: [PATCH] tcp: fix TSO cwnd caching bug tcp_write_xmit caches the cwnd value indirectly in cwnd_quota. When tcp_transmit_skb reduces the cwnd because of tcp_enter_cwr, the cached value becomes invalid. This patch ensures that the cwnd value is always reread after each tcp_transmit_skb call. Signed-off-by: Herbert Xu Cc: "David S. 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/ipv4/tcp_output.c | 34 +++++++++------------------------- 1 file changed, 9 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e118b4b5b326..7d076f0db100 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -972,19 +972,18 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) if (unlikely(sk->sk_state == TCP_CLOSE)) return 0; - skb = sk->sk_send_head; - if (unlikely(!skb)) - return 0; - - tso_segs = tcp_init_tso_segs(sk, skb, mss_now); - cwnd_quota = tcp_cwnd_test(tp, skb); - if (unlikely(!cwnd_quota)) - goto out; - sent_pkts = 0; - while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) { + while ((skb = sk->sk_send_head)) { + tso_segs = tcp_init_tso_segs(sk, skb, mss_now); BUG_ON(!tso_segs); + cwnd_quota = tcp_cwnd_test(tp, skb); + if (!cwnd_quota) + break; + + if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) + break; + if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, mss_now, (tcp_skb_is_last(sk, skb) ? @@ -1026,27 +1025,12 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) tcp_minshall_update(tp, mss_now, skb); sent_pkts++; - - /* Do not optimize this to use tso_segs. If we chopped up - * the packet above, tso_segs will no longer be valid. - */ - cwnd_quota -= tcp_skb_pcount(skb); - - BUG_ON(cwnd_quota < 0); - if (!cwnd_quota) - break; - - skb = sk->sk_send_head; - if (!skb) - break; - tso_segs = tcp_init_tso_segs(sk, skb, mss_now); } if (likely(sent_pkts)) { tcp_cwnd_validate(sk, tp); return 0; } -out: return !tp->packets_out && sk->sk_send_head; } -- cgit From b7656e7f2944984befa3ab99a5b99f99a23b302b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 5 Aug 2005 04:12:48 -0700 Subject: [IPV4]: Fix memory leak during fib_info hash expansion. When we grow the tables, we forget to free the olds ones up. Noticed by Yan Zheng. Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c886b28ba9f5..e278cb9d0075 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -593,10 +593,13 @@ static void fib_hash_move(struct hlist_head *new_info_hash, struct hlist_head *new_laddrhash, unsigned int new_size) { + struct hlist_head *old_info_hash, *old_laddrhash; unsigned int old_size = fib_hash_size; - unsigned int i; + unsigned int i, bytes; write_lock(&fib_info_lock); + old_info_hash = fib_info_hash; + old_laddrhash = fib_info_laddrhash; fib_hash_size = new_size; for (i = 0; i < old_size; i++) { @@ -636,6 +639,10 @@ static void fib_hash_move(struct hlist_head *new_info_hash, fib_info_laddrhash = new_laddrhash; write_unlock(&fib_info_lock); + + bytes = old_size * sizeof(struct hlist_head *); + fib_hash_free(old_info_hash, bytes); + fib_hash_free(old_laddrhash, bytes); } struct fib_info * -- cgit From dcc365d8f28d6a2332fa37e64d669858a8d017e8 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 6 Aug 2005 12:36:42 +0200 Subject: [Bluetooth] Revert session reference counting fix The fix for the reference counting problem of the signal DLC introduced a race condition which leads to an oops. The reason for it is not fully understood by now and so revert this fix, because the reference counting problem is not crashing the RFCOMM layer and its appearance it rare. 
Signed-off-by: Marcel Holtmann --- net/bluetooth/rfcomm/core.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index e9e6fda66f1a..27bf5047cd33 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -389,8 +389,6 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err) rfcomm_dlc_unlock(d); skb_queue_purge(&d->tx_queue); - rfcomm_session_put(s); - rfcomm_dlc_unlink(d); } @@ -600,8 +598,6 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst goto failed; } - rfcomm_session_hold(s); - s->initiator = 1; bacpy(&addr.l2_bdaddr, dst); -- cgit From 66e8b6c31b9254243afaac8af4135e84e11dd38e Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 6 Aug 2005 12:36:51 +0200 Subject: [Bluetooth] Remove unused functions and cleanup symbol exports This patch removes the unused bt_dump() function and it also removes its BT_DMP macro. It also unexports the hci_dev_get(), hci_send_cmd() and hci_si_event() functions. Signed-off-by: Adrian Bunk Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 2 -- net/bluetooth/hci_event.c | 1 - net/bluetooth/lib.c | 25 ------------------------- 3 files changed, 28 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index fb5524365bc2..ffa26c10bfe8 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -299,7 +299,6 @@ struct hci_dev *hci_dev_get(int index) read_unlock(&hci_dev_list_lock); return hdev; } -EXPORT_SYMBOL(hci_dev_get); /* ---- Inquiry support ---- */ static void inquiry_cache_flush(struct hci_dev *hdev) @@ -1042,7 +1041,6 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 ogf, __u16 ocf, __u32 plen, void *p return 0; } -EXPORT_SYMBOL(hci_send_cmd); /* Get data from the previously sent command */ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 ogf, __u16 ocf) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index c4b592b4ef10..26050d13fb47 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1040,4 +1040,3 @@ void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) hci_send_to_sock(hdev, skb); kfree_skb(skb); } -EXPORT_SYMBOL(hci_si_event); diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index 9efb0a093612..ee6a66979913 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -34,31 +34,6 @@ #include -void bt_dump(char *pref, __u8 *buf, int count) -{ - char *ptr; - char line[100]; - unsigned int i; - - printk(KERN_INFO "%s: dump, len %d\n", pref, count); - - ptr = line; - *ptr = 0; - for (i = 0; i < count; i++) { - ptr += sprintf(ptr, " %2.2X", buf[i]); - - if (i && !((i + 1) % 20)) { - printk(KERN_INFO "%s:%s\n", pref, line); - ptr = line; - *ptr = 0; - } - } - - if (line[0]) - printk(KERN_INFO "%s:%s\n", pref, line); -} -EXPORT_SYMBOL(bt_dump); - void baswap(bdaddr_t *dst, bdaddr_t *src) { unsigned char *d = (unsigned char *) dst; -- cgit From 576c7d858f36cab6110b29db7b53964d5132cf30 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 6 Aug 2005 12:36:54 +0200 Subject: [Bluetooth] Add direction and timestamp to stack internal events This patch changes the direction to incoming and adds the timestamp to all stack internal events. 
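[ Ed: the two added lines below write through bt_cb(skb); in kernels of this vintage that accessor is, to the best of my recollection (an assumption, it is not part of this patch), just a cast over the skb control block:

        #define bt_cb(skb) ((struct bt_skb_cb *)((skb)->cb))

so marking the event as incoming and stamping it records per-skb direction and timing metadata for readers of the HCI socket such as hcidump. ]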
Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 26050d13fb47..46367bd129c3 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1035,6 +1035,9 @@ void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) ev->type = type; memcpy(ev->data, data, dlen); + bt_cb(skb)->incoming = 1; + do_gettimeofday(&skb->stamp); + skb->pkt_type = HCI_EVENT_PKT; skb->dev = (void *) hdev; hci_send_to_sock(hdev, skb); -- cgit From 6fc0b4a7a73a81e74d0004732df358f4f9975be2 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 6 Aug 2005 06:33:15 -0700 Subject: [IPSEC]: Restrict socket policy loading to CAP_NET_ADMIN. The interface needs much redesigning if we wish to allow normal users to do this in some way. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 3 +++ net/ipv6/ipv6_sockglue.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index fc7c481d0d79..ff4bd067b397 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -848,6 +848,9 @@ mc_msf_out: case IP_IPSEC_POLICY: case IP_XFRM_POLICY: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; err = xfrm_user_policy(sk, optname, optval, optlen); break; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index f3ef4c38d315..3bc144a79fa5 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -504,6 +504,9 @@ done: break; case IPV6_IPSEC_POLICY: case IPV6_XFRM_POLICY: + retv = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; retv = xfrm_user_policy(sk, optname, optval, optlen); break; -- cgit From 8b83bc77bf77cc8459cb94e52b08e775104c4c48 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Mon, 8 Aug 2005 11:50:55 +0200 Subject: [PATCH] don't try to do any NAT on untracked connections With the introduction of 'rustynat' in 2.6.11, the old tricks of preventing NAT of 'untracked' connections (e.g. NOTRACK target in 'raw' table) are no longer sufficient. The ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK effectively prevents iteration of the 'nat' table, but doesn't prevent nat_packet() to be executed. Since nr_manips is gone in 'rustynat', nat_packet() now implicitly thinks that it has to do NAT on the packet. This patch fixes that problem by explicitly checking for ip_conntrack_untracked in ip_nat_fn(). Signed-off-by: Harald Welte Signed-off-by: Linus Torvalds --- net/ipv4/netfilter/ip_nat_standalone.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index bc59d0d6e89e..91d5ea1dbbc9 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -102,6 +102,10 @@ ip_nat_fn(unsigned int hooknum, return NF_ACCEPT; } + /* Don't try to NAT if this packet is not conntracked */ + if (ct == &ip_conntrack_untracked) + return NF_ACCEPT; + switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED+IP_CT_IS_REPLY: -- cgit From ca9334523c853e407da7b3a0bd02f54d0fa59414 Mon Sep 17 00:00:00 2001 From: Heikki Orsila Date: Mon, 8 Aug 2005 14:26:52 -0700 Subject: [IPV4]: Debug cleanup Here's a small patch to cleanup NETDEBUG() use in net/ipv4/ for Linux kernel 2.6.13-rc5. Also weird use of indentation is changed in some places. Signed-off-by: Heikki Orsila Signed-off-by: David S. 
Miller --- net/ipv4/icmp.c | 3 +-- net/ipv4/ip_fragment.c | 8 +++----- net/ipv4/tcp_ipv4.c | 14 ++++++-------- net/ipv4/udp.c | 34 ++++++++++++++++------------------ 4 files changed, 26 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 279f57abfecb..3d78464f64ea 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -936,8 +936,7 @@ int icmp_rcv(struct sk_buff *skb) case CHECKSUM_HW: if (!(u16)csum_fold(skb->csum)) break; - NETDEBUG(if (net_ratelimit()) - printk(KERN_DEBUG "icmp v4 hw csum failure\n")); + LIMIT_NETDEBUG(printk(KERN_DEBUG "icmp v4 hw csum failure\n")); case CHECKSUM_NONE: if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) goto error; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 7f68e27eb4ea..eb377ae15305 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -377,7 +377,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user) return ip_frag_intern(hash, qp); out_nomem: - NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n")); + LIMIT_NETDEBUG(printk(KERN_ERR "ip_frag_create: no memory left !\n")); return NULL; } @@ -625,10 +625,8 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) return head; out_nomem: - NETDEBUG(if (net_ratelimit()) - printk(KERN_ERR - "IP: queue_glue: no memory for gluing queue %p\n", - qp)); + LIMIT_NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing " + "queue %p\n", qp)); goto out_fail; out_oversize: if (net_ratelimit()) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 62f62bb05c2a..5d91213d34c0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1494,12 +1494,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * to destinations, already remembered * to the moment of synflood. */ - NETDEBUG(if (net_ratelimit()) \ - printk(KERN_DEBUG "TCP: drop open " - "request from %u.%u." - "%u.%u/%u\n", \ - NIPQUAD(saddr), - ntohs(skb->h.th->source))); + LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open " + "request from %u.%u." + "%u.%u/%u\n", + NIPQUAD(saddr), + ntohs(skb->h.th->source))); dst_release(dst); goto drop_and_free; } @@ -1627,8 +1626,7 @@ static int tcp_v4_checksum_init(struct sk_buff *skb) skb->nh.iph->daddr, skb->csum)) return 0; - NETDEBUG(if (net_ratelimit()) - printk(KERN_DEBUG "hw tcp v4 csum failed\n")); + LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n")); skb->ip_summed = CHECKSUM_NONE; } if (skb->len <= 76) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7c24e64b443f..dc4d07357e3a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -628,7 +628,7 @@ back_from_confirm: /* ... which is an evident application bug. 
--ANK */ release_sock(sk); - NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n")); + LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n")); err = -EINVAL; goto out; } @@ -693,7 +693,7 @@ static int udp_sendpage(struct sock *sk, struct page *page, int offset, if (unlikely(!up->pending)) { release_sock(sk); - NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 3\n")); + LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 3\n")); return -EINVAL; } @@ -1102,7 +1102,7 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, skb->ip_summed = CHECKSUM_UNNECESSARY; if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) return 0; - NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp v4 hw csum failure.\n")); + LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v4 hw csum failure.\n")); skb->ip_summed = CHECKSUM_NONE; } if (skb->ip_summed != CHECKSUM_UNNECESSARY) @@ -1181,14 +1181,13 @@ int udp_rcv(struct sk_buff *skb) return(0); short_packet: - NETDEBUG(if (net_ratelimit()) - printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", - NIPQUAD(saddr), - ntohs(uh->source), - ulen, - len, - NIPQUAD(daddr), - ntohs(uh->dest))); + LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", + NIPQUAD(saddr), + ntohs(uh->source), + ulen, + len, + NIPQUAD(daddr), + ntohs(uh->dest))); no_header: UDP_INC_STATS_BH(UDP_MIB_INERRORS); kfree_skb(skb); @@ -1199,13 +1198,12 @@ csum_error: * RFC1122: OK. Discards the bad packet silently (as far as * the network is concerned, anyway) as per 4.1.3.4 (MUST). */ - NETDEBUG(if (net_ratelimit()) - printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", - NIPQUAD(saddr), - ntohs(uh->source), - NIPQUAD(daddr), - ntohs(uh->dest), - ulen)); + LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", + NIPQUAD(saddr), + ntohs(uh->source), + NIPQUAD(daddr), + ntohs(uh->dest), + ulen)); drop: UDP_INC_STATS_BH(UDP_MIB_INERRORS); kfree_skb(skb); -- cgit From 3501466941347f0e1992b2672affb3feb92925fd Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 9 Aug 2005 14:57:12 -0700 Subject: [SUNRPC]: Fix nsec --> usec conversion. We need to divide, not multiply. While we're here, use NSEC_PER_USEC instead of a magic constant. Based upon a report from Josip Loncaric and a patch by Andrew Morton. Signed-off-by: David S. Miller --- net/sunrpc/svcsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 56db8f13e6cb..d0c3120d0233 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -586,7 +586,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) } if (skb->stamp.tv_sec == 0) { skb->stamp.tv_sec = xtime.tv_sec; - skb->stamp.tv_usec = xtime.tv_nsec * 1000; + skb->stamp.tv_usec = xtime.tv_nsec / NSEC_PER_USEC; /* Don't enable netstamp, sunrpc doesn't need that much accuracy */ } -- cgit From d64d3873721cfe870d49d73c3744f06260779ce7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 9 Aug 2005 15:29:19 -0700 Subject: [NET]: Fix memory leak in sys_{send,recv}msg() w/compat From: Dave Johnson sendmsg()/recvmsg() syscalls from o32/n32 apps to a 64bit kernel will cause a kernel memory leak if iov_len > UIO_FASTIOV for each syscall! This is because both sys_sendmsg() and verify_compat_iovec() kmalloc a new iovec structure. Only the one from sys_sendmsg() is free'ed. 
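[ Ed: the allocation pattern described above, modeled as ordinary user-space C. This is an illustration of the bug's shape under the assumptions named in the comments, not the kernel code itself. ]

        #include <stdlib.h>

        #define UIO_FASTIOV 8

        struct iovec_model { void *base; size_t len; };

        /* the old verify_compat_iovec() behaved like this helper: for a
         * large vector it allocated a second buffer of its own */
        static struct iovec_model *verify_iovec_old(size_t n)
        {
                return malloc(n * sizeof(struct iovec_model));  /* alloc #2 */
        }

        static void sendmsg_like(size_t n)
        {
                struct iovec_model fast[UIO_FASTIOV], *iov = fast;

                if (n > UIO_FASTIOV)
                        iov = malloc(n * sizeof(*iov));         /* alloc #1 */

                verify_iovec_old(n);    /* alloc #2 is never freed: the leak */

                if (iov != fast)
                        free(iov);      /* only alloc #1 is ever freed */
        }

[ Ed: the fix below simply deletes the inner allocation, so the helper fills the buffer its caller already provided. ]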
I wrote a simple test program to confirm this after identifying the problem: http://davej.org/programs/testsendmsg.c Note that the below fix will break solaris_sendmsg()/solaris_recvmsg() as it also calls verify_compat_iovec() but expects it to malloc internally. [ I fixed that. -DaveM ] Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/compat.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'net') diff --git a/net/compat.c b/net/compat.c index be5d936dc423..d99ab9695893 100644 --- a/net/compat.c +++ b/net/compat.c @@ -91,20 +91,11 @@ int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov, } else kern_msg->msg_name = NULL; - if(kern_msg->msg_iovlen > UIO_FASTIOV) { - kern_iov = kmalloc(kern_msg->msg_iovlen * sizeof(struct iovec), - GFP_KERNEL); - if(!kern_iov) - return -ENOMEM; - } - tot_len = iov_from_user_compat_to_kern(kern_iov, (struct compat_iovec __user *)kern_msg->msg_iov, kern_msg->msg_iovlen); if(tot_len >= 0) kern_msg->msg_iov = kern_iov; - else if(kern_msg->msg_iovlen > UIO_FASTIOV) - kfree(kern_iov); return tot_len; } -- cgit From 001ab02a8c04f0b4dc773c474da698ad7405ae68 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 10 Aug 2005 11:32:57 -0700 Subject: [DECNET]: Use sk_stream_error function rather than DECnet's own Signed-off-by: Steven Whitehouse Signed-off-by: David S. Miller --- net/decnet/af_decnet.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'net') diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 96a02800cd28..acdd18e6adb2 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1876,15 +1876,6 @@ static inline unsigned int dn_current_mss(struct sock *sk, int flags) return mss_now; } -static int dn_error(struct sock *sk, int flags, int err) -{ - if (err == -EPIPE) - err = sock_error(sk) ? : -EPIPE; - if (err == -EPIPE && !(flags & MSG_NOSIGNAL)) - send_sig(SIGPIPE, current, 0); - return err; -} - static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size) { @@ -2045,7 +2036,7 @@ out: return sent ? sent : err; out_err: - err = dn_error(sk, flags, err); + err = sk_stream_error(sk, flags, err); release_sock(sk); return err; } -- cgit From b5da623ae9be680ea59f268eeb339f0acb2d88c4 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 10 Aug 2005 18:32:36 -0700 Subject: [TCP]: Adjust {p,f}ackets_out correctly in tcp_retransmit_skb() Well I've only found one potential cause for the assertion failure in tcp_mark_head_lost. First of all, this can only occur if cnt > 1 since tp->packets_out is never zero here. If it did hit zero we'd have much bigger problems. So cnt is equal to fackets_out - reordering. Normally fackets_out is less than packets_out. The only reason I've found that might cause fackets_out to exceed packets_out is if tcp_fragment is called from tcp_retransmit_skb with a TSO skb and the current MSS is greater than the MSS stored in the TSO skb. This might occur as the result of an expiring dst entry. In that case, packets_out may decrease (line 1380-1381 in tcp_output.c). However, fackets_out is unchanged which means that it may in fact exceed packets_out. Previously tcp_retrans_try_collapse was the only place where packets_out can go down and it takes care of this by decrementing fackets_out. So we should make sure that fackets_out is reduced by an appropriate amount here as well. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7d076f0db100..3ed6fc15815b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1370,15 +1370,21 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (skb->len > cur_mss) { int old_factor = tcp_skb_pcount(skb); - int new_factor; + int diff; if (tcp_fragment(sk, skb, cur_mss, cur_mss)) return -ENOMEM; /* We'll try again later. */ /* New SKB created, account for it. */ - new_factor = tcp_skb_pcount(skb); - tp->packets_out -= old_factor - new_factor; - tp->packets_out += tcp_skb_pcount(skb->next); + diff = old_factor - tcp_skb_pcount(skb) - + tcp_skb_pcount(skb->next); + tp->packets_out -= diff; + + if (diff > 0) { + tp->fackets_out -= diff; + if ((int)tp->fackets_out < 0) + tp->fackets_out = 0; + } } /* Collapse two adjacent packets if worthwhile and we can. */ -- cgit From 11513128bb66b0b09d5d0df069b58afdb01752a2 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Thu, 11 Aug 2005 19:23:04 -0700 Subject: [NETPOLL]: rx_flags bugfix Initialize npinfo->rx_flags. The way it stands now, this will have random garbage, and so will incur a locking penalty even when an rx_hook isn't registered and we are not active in the netpoll polling code. Signed-off-by: Jeff Moyer Signed-off-by: Matt Mackall Signed-off-by: David S. Miller --- net/core/netpoll.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c327c9edadc5..895f3efc65aa 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -639,6 +639,7 @@ int netpoll_setup(struct netpoll *np) if (!npinfo) goto release; + npinfo->rx_flags = 0; npinfo->rx_np = NULL; npinfo->poll_lock = SPIN_LOCK_UNLOCKED; npinfo->poll_owner = -1; -- cgit From a636e1357911afdea7c8344ee65f78d36caf3c16 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Thu, 11 Aug 2005 19:23:50 -0700 Subject: [NETPOLL]: deadlock bugfix This fixes an obvious deadlock in the netpoll code. netpoll_rx takes the npinfo->rx_lock. netpoll_rx is also the only caller of arp_reply (through __netpoll_rx). As such, it is not necessary to take this lock. Signed-off-by: Jeff Moyer Signed-off-by: Matt Mackall Signed-off-by: David S. Miller --- net/core/netpoll.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 895f3efc65aa..b9d9da082af2 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -353,11 +353,8 @@ static void arp_reply(struct sk_buff *skb) struct sk_buff *send_skb; struct netpoll *np = NULL; - spin_lock_irqsave(&npinfo->rx_lock, flags); if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev) np = npinfo->rx_np; - spin_unlock_irqrestore(&npinfo->rx_lock, flags); - if (!np) return; -- cgit From f0d3459d0722782c7d9d0e35a1ed0815e75fcde5 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Thu, 11 Aug 2005 19:25:11 -0700 Subject: [NETPOLL]: netpoll_send_skb simplify Minor netpoll_send_skb restructuring Restructure to avoid confusing goto and move some bits out of the retry loop. Signed-off-by: Matt Mackall Signed-off-by: David S. 
Miller --- net/core/netpoll.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index b9d9da082af2..59ed186e4f46 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -248,14 +248,14 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) int status; struct netpoll_info *npinfo; -repeat: - if(!np || !np->dev || !netif_running(np->dev)) { + if (!np || !np->dev || !netif_running(np->dev)) { __kfree_skb(skb); return; } - /* avoid recursion */ npinfo = np->dev->npinfo; + + /* avoid recursion */ if (npinfo->poll_owner == smp_processor_id() || np->dev->xmit_lock_owner == smp_processor_id()) { if (np->drop) @@ -265,29 +265,31 @@ repeat: return; } - spin_lock(&np->dev->xmit_lock); - np->dev->xmit_lock_owner = smp_processor_id(); + while (1) { + spin_lock(&np->dev->xmit_lock); + np->dev->xmit_lock_owner = smp_processor_id(); - /* - * network drivers do not expect to be called if the queue is - * stopped. - */ - if (netif_queue_stopped(np->dev)) { + /* + * network drivers do not expect to be called if the queue is + * stopped. + */ + if (netif_queue_stopped(np->dev)) { + np->dev->xmit_lock_owner = -1; + spin_unlock(&np->dev->xmit_lock); + netpoll_poll(np); + continue; + } + + status = np->dev->hard_start_xmit(skb, np->dev); np->dev->xmit_lock_owner = -1; spin_unlock(&np->dev->xmit_lock); - netpoll_poll(np); - goto repeat; - } - - status = np->dev->hard_start_xmit(skb, np->dev); - np->dev->xmit_lock_owner = -1; - spin_unlock(&np->dev->xmit_lock); + /* success */ + if(!status) + return; - /* transmit busy */ - if(status) { + /* transmit busy */ netpoll_poll(np); - goto repeat; } } -- cgit From 0db1d6fc1ea051af49ebe03c503d23996a7c5bbb Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Thu, 11 Aug 2005 19:25:54 -0700 Subject: [NETPOLL]: add retry timeout Add limited retry logic to netpoll_send_skb Each time we attempt to send, decrement our per-device retry counter. On every successful send, we reset the counter. We delay 50us between attempts with up to 20000 retries for a total of 1 second. After we've exhausted our retries, subsequent failed attempts will try only once until reset by success. Signed-off-by: Matt Mackall Signed-off-by: David S. 
Miller --- net/core/netpoll.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 59ed186e4f46..d09affdbad3c 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -33,6 +33,7 @@ #define MAX_UDP_CHUNK 1460 #define MAX_SKBS 32 #define MAX_QUEUE_DEPTH (MAX_SKBS / 2) +#define MAX_RETRIES 20000 static DEFINE_SPINLOCK(skb_list_lock); static int nr_skbs; @@ -265,7 +266,8 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) return; } - while (1) { + do { + npinfo->tries--; spin_lock(&np->dev->xmit_lock); np->dev->xmit_lock_owner = smp_processor_id(); @@ -277,6 +279,7 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) np->dev->xmit_lock_owner = -1; spin_unlock(&np->dev->xmit_lock); netpoll_poll(np); + udelay(50); continue; } @@ -285,12 +288,15 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) spin_unlock(&np->dev->xmit_lock); /* success */ - if(!status) + if(!status) { + npinfo->tries = MAX_RETRIES; /* reset */ return; + } /* transmit busy */ netpoll_poll(np); - } + udelay(50); + } while (npinfo->tries > 0); } void netpoll_send_udp(struct netpoll *np, const char *msg, int len) @@ -642,6 +648,7 @@ int netpoll_setup(struct netpoll *np) npinfo->rx_np = NULL; npinfo->poll_lock = SPIN_LOCK_UNLOCKED; npinfo->poll_owner = -1; + npinfo->tries = MAX_RETRIES; npinfo->rx_lock = SPIN_LOCK_UNLOCKED; } else npinfo = ndev->npinfo; -- cgit From 2652076507b662fc88ba16c27b59c7bdd9ccd956 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Aug 2005 19:26:42 -0700 Subject: [NETPOLL]: pre-fill skb pool we could do one thing (see the patch below): i think it would be useful to fill up the netlogging skb queue straight at initialization time. Especially if netpoll is used for dumping alone, the system might not be in a situation to fill up the queue at the point of crash, so better be a bit more prepared and keep the pipeline filled. [ I've modified this to be called earlier - mpm ] Signed-off-by: Ingo Molnar Signed-off-by: Matt Mackall Signed-off-by: David S. Miller --- net/core/netpoll.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index d09affdbad3c..c02a08da6d42 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -725,6 +725,10 @@ int netpoll_setup(struct netpoll *np) npinfo->rx_np = np; spin_unlock_irqrestore(&npinfo->rx_lock, flags); } + + /* fill up the skb queue */ + refill_skbs(); + /* last thing to do is link it to the net device structure */ ndev->npinfo = npinfo; -- cgit From 53fb95d3c14290fd6ee808b221e35493f096246f Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Thu, 11 Aug 2005 19:27:43 -0700 Subject: [NETPOLL]: fix initialization/NAPI race This fixes a race during initialization with the NAPI softirq processing by using an RCU approach. This race was discovered when refill_skbs() was added to the setup code. Signed-off-by: Matt Mackall Signed-off-by: David S. 
Miller --- net/core/dev.c | 9 +++++---- net/core/netpoll.c | 3 +++ 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 52a3bf7ae177..faf59b02c4bf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1696,7 +1696,8 @@ static void net_rx_action(struct softirq_action *h) struct softnet_data *queue = &__get_cpu_var(softnet_data); unsigned long start_time = jiffies; int budget = netdev_budget; - + void *have; + local_irq_disable(); while (!list_empty(&queue->poll_list)) { @@ -1709,10 +1710,10 @@ static void net_rx_action(struct softirq_action *h) dev = list_entry(queue->poll_list.next, struct net_device, poll_list); - netpoll_poll_lock(dev); + have = netpoll_poll_lock(dev); if (dev->quota <= 0 || dev->poll(dev, &budget)) { - netpoll_poll_unlock(dev); + netpoll_poll_unlock(have); local_irq_disable(); list_del(&dev->poll_list); list_add_tail(&dev->poll_list, &queue->poll_list); @@ -1721,7 +1722,7 @@ static void net_rx_action(struct softirq_action *h) else dev->quota = dev->weight; } else { - netpoll_poll_unlock(dev); + netpoll_poll_unlock(have); dev_put(dev); local_irq_disable(); } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c02a08da6d42..996787bca17f 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -732,6 +732,9 @@ int netpoll_setup(struct netpoll *np) /* last thing to do is link it to the net device structure */ ndev->npinfo = npinfo; + /* avoid racing with NAPI reading npinfo */ + synchronize_rcu(); + return 0; release: -- cgit From d7b9dfc8ea43936e6e8eec3040dcf4f110563868 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Thu, 11 Aug 2005 19:28:05 -0700 Subject: [NETPOLL]: remove unused variable Remove unused variable Signed-off-by: Matt Mackall Signed-off-by: David S. Miller --- net/core/netpoll.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 996787bca17f..a1a9a7abff50 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -357,7 +357,6 @@ static void arp_reply(struct sk_buff *skb) unsigned char *arp_ptr; int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; u32 sip, tip; - unsigned long flags; struct sk_buff *send_skb; struct netpoll *np = NULL; -- cgit From 58fcb8df0bf663bb6b8f46cd3010bfe8d13d97cf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 10 Aug 2005 18:15:12 -0400 Subject: [PATCH] NFS: Ensure ACL xdr code doesn't overflow. Signed-off-by: Trond Myklebust Signed-off-by: Linus Torvalds --- net/sunrpc/xdr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 8a4d9c106af1..fde16f40a581 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -993,6 +993,7 @@ xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, return -EINVAL; } else { if (xdr_decode_word(buf, base, &desc->array_len) != 0 || + desc->array_len > desc->array_maxlen || (unsigned long) base + 4 + desc->array_len * desc->elem_size > buf->len) return -EINVAL; -- cgit From 97077c4a9868fce8ac151512cde5d24fc1144f24 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 17 Aug 2005 12:03:32 -0700 Subject: [IPV6]: Fix raw socket hardware checksum failures When packets hit raw sockets the csum update isn't done yet, do it manually. Packets can also reach rawv6_rcv on the output path through ip6_call_ra_chain, in this case skb->ip_summed is CHECKSUM_NONE and this codepath isn't executed. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv6/raw.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index e2b848ec9851..1d4d75b34d32 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -328,6 +328,8 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (skb->ip_summed != CHECKSUM_UNNECESSARY) { if (skb->ip_summed == CHECKSUM_HW) { + skb_postpull_rcsum(skb, skb->nh.raw, + skb->h.raw - skb->nh.raw); skb->ip_summed = CHECKSUM_UNNECESSARY; if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, -- cgit From 35d59efd105b3b7c1b5878dcc9d1749f41f9740f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 17 Aug 2005 12:03:59 -0700 Subject: [TCP]: Fix bug #5070: kernel BUG at net/ipv4/tcp_output.c:864 1) We send out a normal sized packet with TSO on to start off. 2) ICMP is received indicating a smaller MTU. 3) We send the current sk_send_head which needs to be fragmented since it was created before the ICMP event. The first fragment is then sent out. At this point the remaining fragment is allocated by tcp_fragment. However, its size is padded to fit the L1 cache-line size therefore creating tail-room up to 124 bytes long. This fragment will also be sitting at sk_send_head. 4) tcp_sendmsg is called again and it stores data in the tail-room of of the fragment. 5) tcp_push_one is called by tcp_sendmsg which then calls tso_fragment since the packet as a whole exceeds the MTU. At this point we have a packet that has data in the head area being fed to tso_fragment which bombs out. My take on this is that we shouldn't ever call tcp_fragment on a TSO socket for a packet that is yet to be transmitted since this creates a packet on sk_send_head that cannot be extended. So here is a patch to change it so that tso_fragment is always used in this case. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3ed6fc15815b..566045e58437 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -861,7 +861,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, u16 flags; /* All of a TSO frame must be composed of paged data. 
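 * (Ed: "paged data" means every payload byte lives in skb_shinfo(skb)->frags
 * rather than in the linear head. Step 4 of the scenario above stores new
 * bytes in the fragment's linear tail-room, which is why the old BUG_ON()
 * below could fire; the replacement falls back to tcp_fragment(), which can
 * split an skb that carries head data.)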
*/ - BUG_ON(skb->len != skb->data_len); + if (skb->len != skb->data_len) + return tcp_fragment(sk, skb, len, mss_now); buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC); if (unlikely(buff == NULL)) @@ -974,6 +975,8 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) sent_pkts = 0; while ((skb = sk->sk_send_head)) { + unsigned int limit; + tso_segs = tcp_init_tso_segs(sk, skb, mss_now); BUG_ON(!tso_segs); @@ -994,9 +997,10 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) break; } + limit = mss_now; if (tso_segs > 1) { - u32 limit = tcp_window_allows(tp, skb, - mss_now, cwnd_quota); + limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); if (skb->len < limit) { unsigned int trim = skb->len % mss_now; @@ -1004,15 +1008,12 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) if (trim) limit = skb->len - trim; } - if (skb->len > limit) { - if (tso_fragment(sk, skb, limit, mss_now)) - break; - } - } else if (unlikely(skb->len > mss_now)) { - if (unlikely(tcp_fragment(sk, skb, mss_now, mss_now))) - break; } + if (skb->len > limit && + unlikely(tso_fragment(sk, skb, limit, mss_now))) + break; + TCP_SKB_CB(skb)->when = tcp_time_stamp; if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) @@ -1064,11 +1065,14 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH); if (likely(cwnd_quota)) { + unsigned int limit; + BUG_ON(!tso_segs); + limit = mss_now; if (tso_segs > 1) { - u32 limit = tcp_window_allows(tp, skb, - mss_now, cwnd_quota); + limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); if (skb->len < limit) { unsigned int trim = skb->len % mss_now; @@ -1076,15 +1080,12 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) if (trim) limit = skb->len - trim; } - if (skb->len > limit) { - if (unlikely(tso_fragment(sk, skb, limit, mss_now))) - return; - } - } else if (unlikely(skb->len > mss_now)) { - if (unlikely(tcp_fragment(sk, skb, mss_now, mss_now))) - return; } + if (skb->len > limit && + unlikely(tso_fragment(sk, skb, limit, mss_now))) + return; + /* Send it out now. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; -- cgit From bfd272b1ca1164382eabaa9986aad822adb91eb2 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 17 Aug 2005 12:04:22 -0700 Subject: [IPV6]: Fix SKB leak in ip6_input_finish() Changing it to how ip_input handles should fix it. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv6/ip6_input.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 866f10726c58..10fbb50daea4 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -198,12 +198,13 @@ resubmit: if (!raw_sk) { if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { IP6_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS); - icmpv6_param_prob(skb, ICMPV6_UNK_NEXTHDR, nhoff); + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_UNK_NEXTHDR, nhoff, + skb->dev); } - } else { + } else IP6_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); - kfree_skb(skb); - } + kfree_skb(skb); } rcu_read_unlock(); return 0; -- cgit From 1f07247de51efd30c88ad8e3e06a8b5382fc7d35 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Aug 2005 12:05:27 -0700 Subject: [DECNET]: Fix RCU race condition in dn_neigh_construct(). Signed-off-by: Paul E. McKenney Signed-off-by: David S. 
Miller --- net/decnet/dn_neigh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index f32dba9e26fe..8d0cc3cf3e49 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -148,12 +148,12 @@ static int dn_neigh_construct(struct neighbour *neigh) __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); - rcu_read_unlock(); if (dn_db->use_long) neigh->ops = &dn_long_ops; else neigh->ops = &dn_short_ops; + rcu_read_unlock(); if (dn->flags & DN_NDFLAG_P3) neigh->ops = &dn_phase3_ops; -- cgit From 001dd250c1c68667a5c3b74979fa614e2edc9ceb Mon Sep 17 00:00:00 2001 From: Jay Vosburgh Date: Thu, 18 Aug 2005 14:04:51 -0700 Subject: [TOKENRING]: Use interrupt-safe locking with rif_lock. Change operations on rif_lock from spin_{un}lock_bh to spin_{un}lock_irq{save,restore} equivalents. Some of the rif_lock critical sections are called from interrupt context via tr_type_trans->tr_add_rif_info. The TR NIC drivers call tr_type_trans from their packet receive handlers. Signed-off-by: Jay Vosburgh Signed-off-by: John W. Linville Signed-off-by: David S. Miller --- net/802/tr.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/802/tr.c b/net/802/tr.c index a755e880f4ba..1bb7dc1b85cd 100644 --- a/net/802/tr.c +++ b/net/802/tr.c @@ -251,10 +251,11 @@ void tr_source_route(struct sk_buff *skb,struct trh_hdr *trh,struct net_device * unsigned int hash; struct rif_cache *entry; unsigned char *olddata; + unsigned long flags; static const unsigned char mcast_func_addr[] = {0xC0,0x00,0x00,0x04,0x00,0x00}; - spin_lock_bh(&rif_lock); + spin_lock_irqsave(&rif_lock, flags); /* * Broadcasts are single route as stated in RFC 1042 @@ -323,7 +324,7 @@ printk("source routing for %02X:%02X:%02X:%02X:%02X:%02X\n",trh->daddr[0], else slack = 18 - ((ntohs(trh->rcf) & TR_RCF_LEN_MASK)>>8); olddata = skb->data; - spin_unlock_bh(&rif_lock); + spin_unlock_irqrestore(&rif_lock, flags); skb_pull(skb, slack); memmove(skb->data, olddata, sizeof(struct trh_hdr) - slack); @@ -337,10 +338,11 @@ printk("source routing for %02X:%02X:%02X:%02X:%02X:%02X\n",trh->daddr[0], static void tr_add_rif_info(struct trh_hdr *trh, struct net_device *dev) { unsigned int hash, rii_p = 0; + unsigned long flags; struct rif_cache *entry; - spin_lock_bh(&rif_lock); + spin_lock_irqsave(&rif_lock, flags); /* * Firstly see if the entry exists @@ -378,7 +380,7 @@ printk("adding rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n", if(!entry) { printk(KERN_DEBUG "tr.c: Couldn't malloc rif cache entry !\n"); - spin_unlock_bh(&rif_lock); + spin_unlock_irqrestore(&rif_lock, flags); return; } @@ -420,7 +422,7 @@ printk("updating rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n", } entry->last_used=jiffies; } - spin_unlock_bh(&rif_lock); + spin_unlock_irqrestore(&rif_lock, flags); } /* @@ -430,9 +432,9 @@ printk("updating rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n", static void rif_check_expire(unsigned long dummy) { int i; - unsigned long next_interval = jiffies + sysctl_tr_rif_timeout/2; + unsigned long flags, next_interval = jiffies + sysctl_tr_rif_timeout/2; - spin_lock_bh(&rif_lock); + spin_lock_irqsave(&rif_lock, flags); for(i =0; i < RIF_TABLE_SIZE; i++) { struct rif_cache *entry, **pentry; @@ -454,7 +456,7 @@ static void rif_check_expire(unsigned long dummy) } } - spin_unlock_bh(&rif_lock); + spin_unlock_irqrestore(&rif_lock, flags); mod_timer(&rif_timer, 
next_interval); @@ -485,7 +487,7 @@ static struct rif_cache *rif_get_idx(loff_t pos) static void *rif_seq_start(struct seq_file *seq, loff_t *pos) { - spin_lock_bh(&rif_lock); + spin_lock_irq(&rif_lock); return *pos ? rif_get_idx(*pos - 1) : SEQ_START_TOKEN; } @@ -516,7 +518,7 @@ static void *rif_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void rif_seq_stop(struct seq_file *seq, void *v) { - spin_unlock_bh(&rif_lock); + spin_unlock_irq(&rif_lock); } static int rif_seq_show(struct seq_file *seq, void *v) -- cgit From cb94c62c252796f42bb83fe40960d12f3ea5a82a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 18 Aug 2005 14:05:44 -0700 Subject: [IPV4]: Fix DST leak in icmp_push_reply() Based upon a bug report and initial patch by Ollie Wild. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 3d78464f64ea..badfc5849973 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -349,12 +349,12 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, { struct sk_buff *skb; - ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param, - icmp_param->data_len+icmp_param->head_len, - icmp_param->head_len, - ipc, rt, MSG_DONTWAIT); - - if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) { + if (ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param, + icmp_param->data_len+icmp_param->head_len, + icmp_param->head_len, + ipc, rt, MSG_DONTWAIT) < 0) + ip_flush_pending_frames(icmp_socket->sk); + else if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) { struct icmphdr *icmph = skb->h.icmph; unsigned int csum = 0; struct sk_buff *skb1; -- cgit From 6fc8b9e7c60d4a3d4d7f1189f74e37651f5610e6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 18 Aug 2005 14:36:59 -0700 Subject: [IPCOMP]: Fix false smp_processor_id warning This patch fixes a false-positive from debug_smp_processor_id(). The processor ID is only used to look up crypto_tfm objects. Any processor ID is acceptable here as long as it is one that is iterated on by for_each_cpu(). Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ipcomp.c | 2 +- net/ipv6/ipcomp6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 2065944fd9e5..7ded6e60f43a 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -358,7 +358,7 @@ static struct crypto_tfm **ipcomp_alloc_tfms(const char *alg_name) int cpu; /* This can be any valid CPU ID so we don't need locking. */ - cpu = smp_processor_id(); + cpu = raw_smp_processor_id(); list_for_each_entry(pos, &ipcomp_tfms_list, list) { struct crypto_tfm *tfm; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 423feb46ccc0..135383ef538f 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -354,7 +354,7 @@ static struct crypto_tfm **ipcomp6_alloc_tfms(const char *alg_name) int cpu; /* This can be any valid CPU ID so we don't need locking. */ - cpu = smp_processor_id(); + cpu = raw_smp_processor_id(); list_for_each_entry(pos, &ipcomp6_tfms_list, list) { struct crypto_tfm *tfm; -- cgit From fd841326d73096ad79be9c3fa348f9ad04541cc2 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 20 Aug 2005 17:38:40 -0700 Subject: [NETFILTER]: Fix ECN target TCP marking An incorrect check made it bail out before doing anything. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ipt_ECN.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index ada9911118e9..d3250a3a5db4 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -61,10 +61,10 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) if (!tcph) return 0; - if (!(einfo->operation & IPT_ECN_OP_SET_ECE - || tcph->ece == einfo->proto.tcp.ece) - && (!(einfo->operation & IPT_ECN_OP_SET_CWR - || tcph->cwr == einfo->proto.tcp.cwr))) + if ((!(einfo->operation & IPT_ECN_OP_SET_ECE) || + tcph->ece == einfo->proto.tcp.ece) && + ((!(einfo->operation & IPT_ECN_OP_SET_CWR) || + tcph->cwr == einfo->proto.tcp.cwr))) return 1; if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) -- cgit From f93592ff4fa4a55aa7640d435fa93338e190294d Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 20 Aug 2005 17:39:15 -0700 Subject: [NETFILTER]: Fix HW checksum handling in ECN target Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_ECN.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index d3250a3a5db4..94a0ce1c1c9d 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -71,6 +71,10 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) return 0; tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; + if ((*pskb)->ip_summed == CHECKSUM_HW && + skb_checksum_help(*pskb, inward)) + return 0; + diffs[0] = ((u_int16_t *)tcph)[6]; if (einfo->operation & IPT_ECN_OP_SET_ECE) tcph->ece = einfo->proto.tcp.ece; @@ -79,13 +83,10 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) diffs[1] = ((u_int16_t *)tcph)[6]; diffs[0] = diffs[0] ^ 0xFFFF; - if ((*pskb)->ip_summed != CHECKSUM_HW) + if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) tcph->check = csum_fold(csum_partial((char *)diffs, sizeof(diffs), tcph->check^0xFFFF)); - else - if (skb_checksum_help(*pskb, inward)) - return 0; (*pskb)->nfcache |= NFC_ALTERED; return 1; } -- cgit From 7e71af49d46e4c25f17a2c8f53d62ffd14f01007 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 20 Aug 2005 17:40:41 -0700 Subject: [NETFILTER]: Fix HW checksum handling in TCPMSS target Most importantly, remove bogus BUG() in receive path. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_TCPMSS.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c index 1049050b2bfb..7b84a254440e 100644 --- a/net/ipv4/netfilter/ipt_TCPMSS.c +++ b/net/ipv4/netfilter/ipt_TCPMSS.c @@ -61,6 +61,10 @@ ipt_tcpmss_target(struct sk_buff **pskb, if (!skb_ip_make_writable(pskb, (*pskb)->len)) return NF_DROP; + if ((*pskb)->ip_summed == CHECKSUM_HW && + skb_checksum_help(*pskb, out == NULL)) + return NF_DROP; + iph = (*pskb)->nh.iph; tcplen = (*pskb)->len - iph->ihl*4; @@ -186,9 +190,6 @@ ipt_tcpmss_target(struct sk_buff **pskb, newmss); retmodified: - /* We never hw checksum SYN packets. 
*/ - BUG_ON((*pskb)->ip_summed == CHECKSUM_HW); - (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED; return IPT_CONTINUE; } -- cgit From 14869c388673e8db3348ab3706fa6485d0f0cf95 Mon Sep 17 00:00:00 2001 From: Dmitry Yusupov Date: Tue, 23 Aug 2005 10:09:27 -0700 Subject: [TCP]: Do TSO deferral even if tail SKB can go out now. If the tail SKB fits into the window, it is still beneficial to defer until the goal percentage of the window is available. This gives the application time to feed more data into the send queue and thus results in larger TSO frames going out. Patch from Dmitry Yusupov . Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 566045e58437..dd30dd137b74 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -925,10 +925,6 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_ limit = min(send_win, cong_win); - /* If sk_send_head can be sent fully now, just do it. */ - if (skb->len <= limit) - return 0; - if (sysctl_tcp_tso_win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); -- cgit From c3a20692ca5c8eb8cf5d0f489d4fc839ce7593d1 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 23 Aug 2005 10:09:53 -0700 Subject: [RPC]: Kill bogus kmap in krb5 While I was going through the crypto users recently, I noticed this bogus kmap in sunrpc. It's totally unnecessary since the crypto layer will do its own kmap before touching the data. Besides, the kmap is throwing the return value away. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/sunrpc/auth_gss/gss_krb5_crypto.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 24c21f2a33a7..5a7265aeaf83 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -185,9 +185,7 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, sg->page = body->pages[i]; sg->offset = offset; sg->length = thislen; - kmap(sg->page); /* XXX kmap_atomic? */ crypto_digest_update(tfm, sg, 1); - kunmap(sg->page); len -= thislen; i++; offset = 0; -- cgit From 1344a41637114485fac7afa1505bce2ff862807a Mon Sep 17 00:00:00 2001 From: Dave Johnson Date: Tue, 23 Aug 2005 10:10:15 -0700 Subject: [IPV4]: Fix negative timer loop with lots of ipv4 peers. From: Dave Johnson Found this bug while doing some scaling testing that created 500K inet peers. peer_check_expire() in net/ipv4/inetpeer.c isn't using inet_peer_gc_mintime correctly and will end up creating an expire timer with less than the minimum duration, and even zero/negative if enough active peers are present. If >65K peers, the timer will be less than inet_peer_gc_mintime, and with >70K peers, the timer duration will reach zero and go negative. The timer handler will continue to schedule another zero/negative timer in a loop until peers can be aged. This can continue for at least a few minutes or even longer if the peers remain active due to arriving packets while the loop is occurring. Bug is present in both 2.4 and 2.6. Same patch will apply to both just fine. Signed-off-by: Andrew Morton Signed-off-by: David S.
Miller --- net/ipv4/inetpeer.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 95473953c406..ab18a853d7ce 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -450,10 +450,13 @@ static void peer_check_expire(unsigned long dummy) /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime * interval depending on the total number of entries (more entries, * less interval). */ - peer_periodic_timer.expires = jiffies - + inet_peer_gc_maxtime - - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * - peer_total / inet_peer_threshold * HZ; + if (peer_total >= inet_peer_threshold) + peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; + else + peer_periodic_timer.expires = jiffies + + inet_peer_gc_maxtime + - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * + peer_total / inet_peer_threshold * HZ; add_timer(&peer_periodic_timer); } -- cgit From 66a79a19a7c582efd99bb143c3a59fbda006eb39 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 23 Aug 2005 10:10:35 -0700 Subject: [NETFILTER]: Fix HW checksum handling in ip_queue/ip6_queue The checksum needs to be filled in on output, after mangling a packet ip_summed needs to be reset. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_queue.c | 7 +++++++ net/ipv6/netfilter/ip6_queue.c | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index eda1fba431a4..c6baa8174389 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -214,6 +214,12 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) break; case IPQ_COPY_PACKET: + if (entry->skb->ip_summed == CHECKSUM_HW && + (*errp = skb_checksum_help(entry->skb, + entry->info->outdev == NULL))) { + read_unlock_bh(&queue_lock); + return NULL; + } if (copy_range == 0 || copy_range > entry->skb->len) data_len = entry->skb->len; else @@ -385,6 +391,7 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) if (!skb_ip_make_writable(&e->skb, v->data_len)) return -ENOMEM; memcpy(e->skb->data, v->payload, v->data_len); + e->skb->ip_summed = CHECKSUM_NONE; e->skb->nfcache |= NFC_ALTERED; /* diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index 5493180f0d44..a16df5b27c84 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -211,6 +211,12 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) break; case IPQ_COPY_PACKET: + if (entry->skb->ip_summed == CHECKSUM_HW && + (*errp = skb_checksum_help(entry->skb, + entry->info->outdev == NULL))) { + read_unlock_bh(&queue_lock); + return NULL; + } if (copy_range == 0 || copy_range > entry->skb->len) data_len = entry->skb->len; else @@ -381,6 +387,7 @@ ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) if (!skb_ip_make_writable(&e->skb, v->data_len)) return -ENOMEM; memcpy(e->skb->data, v->payload, v->data_len); + e->skb->ip_summed = CHECKSUM_NONE; e->skb->nfcache |= NFC_ALTERED; /* -- cgit From 53b924b31fa53ac3007df3fef6870d5074a9adf8 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 23 Aug 2005 10:11:30 -0700 Subject: [NET]: Fix socket bitop damage The socket flag cleanups that went into 2.6.12-rc1 are basically oring the flags of an old socket into the socket just being created. Unfortunately that one was just initialized by sock_init_data(), so already has SOCK_ZAPPED set. 
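[ Ed: sock_copy_flags() itself lives in include/net/sock.h and is not shown in the hunks below; presumably (an assumption, not confirmed by this diff) it is the obvious whole-word assignment:

        static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
        {
                nsk->sk_flags = osk->sk_flags;  /* replace, never OR into */
        }

Replacing rather than ORing the flags word is what keeps the stale SOCK_ZAPPED bit from surviving, as the rest of the message explains. ]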
From 53b924b31fa53ac3007df3fef6870d5074a9adf8 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 23 Aug 2005 10:11:30 -0700 Subject: [NET]: Fix socket bitop damage The socket flag cleanups that went into 2.6.12-rc1 are basically ORing the flags of an old socket into the socket just being created. Unfortunately that one was just initialized by sock_init_data(), so it already has SOCK_ZAPPED set. As a result, zapped sockets are created and all incoming connections will fail due to this bug, which again was carefully replicated to at least AX.25, NET/ROM and ROSE. In order to keep the abstraction alive I've introduced sock_copy_flags() to copy the socket flags from one socket to another and used that instead of the bitwise copy thing. Anyway, the idea here has probably been to copy all flags, so sock_copy_flags() should be the right thing. With this the ham radio protocols are usable again, so I hope this will make it into 2.6.13. Signed-off-by: Ralf Baechle DL5RB Signed-off-by: David S. Miller --- net/ax25/af_ax25.c | 7 +------ net/netrom/af_netrom.c | 7 +------ net/rose/af_rose.c | 7 +------ 3 files changed, 3 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 707097deac3d..7d8ecadba668 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -875,12 +875,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev) sk->sk_sndbuf = osk->sk_sndbuf; sk->sk_state = TCP_ESTABLISHED; sk->sk_sleep = osk->sk_sleep; - - if (sock_flag(osk, SOCK_DBG)) - sock_set_flag(sk, SOCK_DBG); - - if (sock_flag(osk, SOCK_ZAPPED)) - sock_set_flag(sk, SOCK_ZAPPED); + sock_copy_flags(sk, osk); oax25 = ax25_sk(osk); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 31ed4a9a1d06..5385835e9267 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -459,12 +459,7 @@ static struct sock *nr_make_new(struct sock *osk) sk->sk_sndbuf = osk->sk_sndbuf; sk->sk_state = TCP_ESTABLISHED; sk->sk_sleep = osk->sk_sleep; - - if (sock_flag(osk, SOCK_ZAPPED)) - sock_set_flag(sk, SOCK_ZAPPED); - - if (sock_flag(osk, SOCK_DBG)) - sock_set_flag(sk, SOCK_DBG); + sock_copy_flags(sk, osk); skb_queue_head_init(&nr->ack_queue); skb_queue_head_init(&nr->reseq_queue); diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 7eb6a5bf93ea..3fe7e562125a 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -556,12 +556,7 @@ static struct sock *rose_make_new(struct sock *osk) sk->sk_sndbuf = osk->sk_sndbuf; sk->sk_state = TCP_ESTABLISHED; sk->sk_sleep = osk->sk_sleep; - - if (sock_flag(osk, SOCK_ZAPPED)) - sock_set_flag(sk, SOCK_ZAPPED); - - if (sock_flag(osk, SOCK_DBG)) - sock_set_flag(sk, SOCK_DBG); + sock_copy_flags(sk, osk); init_timer(&rose->timer); init_timer(&rose->idletimer); -- cgit
From 01d7dd0e9f8c5f1888619d2649c7da389232b408 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 23 Aug 2005 10:11:45 -0700 Subject: [AX25]: UID fixes o Brown paperbag bug - ax25_findbyuid() was always returning a NULL pointer as the result. Breaks ROSE completely and AX.25 if the UID policy is set to deny. o While the list structure of AX.25's UID to callsign mapping table was properly protected by a spinlock, its elements were not refcounted, resulting in a race between removal and usage of an element. Signed-off-by: Ralf Baechle DL5RB Signed-off-by: David S.
Miller --- net/ax25/af_ax25.c | 20 +++++++----- net/ax25/ax25_route.c | 12 +++++--- net/ax25/ax25_uid.c | 83 ++++++++++++++++++++++---------------------------- net/netrom/af_netrom.c | 24 ++++++++++----- net/rose/af_rose.c | 20 +++++++----- 5 files changed, 84 insertions(+), 75 deletions(-) (limited to 'net') diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 7d8ecadba668..a5c94f11547c 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1002,7 +1002,8 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct sock *sk = sock->sk; struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; ax25_dev *ax25_dev = NULL; - ax25_address *call; + ax25_uid_assoc *user; + ax25_address call; ax25_cb *ax25; int err = 0; @@ -1021,9 +1022,15 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr->fsa_ax25.sax25_family != AF_AX25) return -EINVAL; - call = ax25_findbyuid(current->euid); - if (call == NULL && ax25_uid_policy && !capable(CAP_NET_ADMIN)) { - return -EACCES; + user = ax25_findbyuid(current->euid); + if (user) { + call = user->call; + ax25_uid_put(user); + } else { + if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) + return -EACCES; + + call = addr->fsa_ax25.sax25_call; } lock_sock(sk); @@ -1034,10 +1041,7 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - if (call == NULL) - ax25->source_addr = addr->fsa_ax25.sax25_call; - else - ax25->source_addr = *call; + ax25->source_addr = call; /* * User already set interface with SO_BINDTODEVICE diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 44b99b1ff9f8..c288526da4ce 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -422,8 +422,8 @@ static inline void ax25_adjust_path(ax25_address *addr, ax25_digi *digipeat) */ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) { + ax25_uid_assoc *user; ax25_route *ax25_rt; - ax25_address *call; int err; if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL) @@ -434,16 +434,18 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) goto put; } - if ((call = ax25_findbyuid(current->euid)) == NULL) { + user = ax25_findbyuid(current->euid); + if (user) { + ax25->source_addr = user->call; + ax25_uid_put(user); + } else { if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { err = -EPERM; goto put; } - call = (ax25_address *)ax25->ax25_dev->dev->dev_addr; + ax25->source_addr = *(ax25_address *)ax25->ax25_dev->dev->dev_addr; } - ax25->source_addr = *call; - if (ax25_rt->digipeat != NULL) { if ((ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { err = -ENOMEM; diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c index cea6b7d19729..a8b3822f3ee4 100644 --- a/net/ax25/ax25_uid.c +++ b/net/ax25/ax25_uid.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -41,38 +42,41 @@ * Callsign/UID mapper. This is in kernel space for security on multi-amateur machines. 
*/ -static ax25_uid_assoc *ax25_uid_list; +HLIST_HEAD(ax25_uid_list); static DEFINE_RWLOCK(ax25_uid_lock); int ax25_uid_policy = 0; -ax25_address *ax25_findbyuid(uid_t uid) +ax25_uid_assoc *ax25_findbyuid(uid_t uid) { - ax25_uid_assoc *ax25_uid; - ax25_address *res = NULL; + ax25_uid_assoc *ax25_uid, *res = NULL; + struct hlist_node *node; read_lock(&ax25_uid_lock); - for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) { + ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) { if (ax25_uid->uid == uid) { - res = &ax25_uid->call; + ax25_uid_hold(ax25_uid); + res = ax25_uid; break; } } read_unlock(&ax25_uid_lock); - return NULL; + return res; } int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax) { - ax25_uid_assoc *s, *ax25_uid; + ax25_uid_assoc *ax25_uid; + struct hlist_node *node; + ax25_uid_assoc *user; unsigned long res; switch (cmd) { case SIOCAX25GETUID: res = -ENOENT; read_lock(&ax25_uid_lock); - for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) { + ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) { if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) { res = ax25_uid->uid; break; @@ -85,19 +89,22 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax) case SIOCAX25ADDUID: if (!capable(CAP_NET_ADMIN)) return -EPERM; - if (ax25_findbyuid(sax->sax25_uid)) + user = ax25_findbyuid(sax->sax25_uid); + if (user) { + ax25_uid_put(user); return -EEXIST; + } if (sax->sax25_uid == 0) return -EINVAL; if ((ax25_uid = kmalloc(sizeof(*ax25_uid), GFP_KERNEL)) == NULL) return -ENOMEM; + atomic_set(&ax25_uid->refcount, 1); ax25_uid->uid = sax->sax25_uid; ax25_uid->call = sax->sax25_call; write_lock(&ax25_uid_lock); - ax25_uid->next = ax25_uid_list; - ax25_uid_list = ax25_uid; + hlist_add_head(&ax25_uid->uid_node, &ax25_uid_list); write_unlock(&ax25_uid_lock); return 0; @@ -106,34 +113,21 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax) if (!capable(CAP_NET_ADMIN)) return -EPERM; + ax25_uid = NULL; write_lock(&ax25_uid_lock); - for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) { - if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) { + ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) { + if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) break; - } } if (ax25_uid == NULL) { write_unlock(&ax25_uid_lock); return -ENOENT; } - if ((s = ax25_uid_list) == ax25_uid) { - ax25_uid_list = s->next; - write_unlock(&ax25_uid_lock); - kfree(ax25_uid); - return 0; - } - while (s != NULL && s->next != NULL) { - if (s->next == ax25_uid) { - s->next = ax25_uid->next; - write_unlock(&ax25_uid_lock); - kfree(ax25_uid); - return 0; - } - s = s->next; - } + hlist_del_init(&ax25_uid->uid_node); + ax25_uid_put(ax25_uid); write_unlock(&ax25_uid_lock); - return -ENOENT; + return 0; default: return -EINVAL; @@ -147,13 +141,11 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax) static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos) { struct ax25_uid_assoc *pt; - int i = 1; + struct hlist_node *node; + int i = 0; read_lock(&ax25_uid_lock); - if (*pos == 0) - return SEQ_START_TOKEN; - - for (pt = ax25_uid_list; pt != NULL; pt = pt->next) { + ax25_uid_for_each(pt, node, &ax25_uid_list) { if (i == *pos) return pt; ++i; @@ -164,8 +156,9 @@ static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos) static void *ax25_uid_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; - return (v == SEQ_START_TOKEN) ? 
ax25_uid_list : - ((struct ax25_uid_assoc *) v)->next; + + return hlist_entry(((ax25_uid_assoc *)v)->uid_node.next, + ax25_uid_assoc, uid_node); } static void ax25_uid_seq_stop(struct seq_file *seq, void *v) @@ -179,7 +172,6 @@ static int ax25_uid_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "Policy: %d\n", ax25_uid_policy); else { struct ax25_uid_assoc *pt = v; - seq_printf(seq, "%6d %s\n", pt->uid, ax2asc(&pt->call)); } @@ -213,16 +205,13 @@ struct file_operations ax25_uid_fops = { */ void __exit ax25_uid_free(void) { - ax25_uid_assoc *s, *ax25_uid; + ax25_uid_assoc *ax25_uid; + struct hlist_node *node; write_lock(&ax25_uid_lock); - ax25_uid = ax25_uid_list; - while (ax25_uid != NULL) { - s = ax25_uid; - ax25_uid = ax25_uid->next; - - kfree(s); + ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) { + hlist_del_init(&ax25_uid->uid_node); + ax25_uid_put(ax25_uid); } - ax25_uid_list = NULL; write_unlock(&ax25_uid_lock); } diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 5385835e9267..162a85fed150 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -536,7 +536,8 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct nr_sock *nr = nr_sk(sk); struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; struct net_device *dev; - ax25_address *user, *source; + ax25_uid_assoc *user; + ax25_address *source; lock_sock(sk); if (!sock_flag(sk, SOCK_ZAPPED)) { @@ -575,16 +576,19 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } else { source = &addr->fsa_ax25.sax25_call; - if ((user = ax25_findbyuid(current->euid)) == NULL) { + user = ax25_findbyuid(current->euid); + if (user) { + nr->user_addr = user->call; + ax25_uid_put(user); + } else { if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { release_sock(sk); dev_put(dev); return -EPERM; } - user = source; + nr->user_addr = *source; } - nr->user_addr = *user; nr->source_addr = *source; } @@ -604,7 +608,8 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr, struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); struct sockaddr_ax25 *addr = (struct sockaddr_ax25 *)uaddr; - ax25_address *user, *source = NULL; + ax25_address *source = NULL; + ax25_uid_assoc *user; struct net_device *dev; lock_sock(sk); @@ -645,16 +650,19 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr, } source = (ax25_address *)dev->dev_addr; - if ((user = ax25_findbyuid(current->euid)) == NULL) { + user = ax25_findbyuid(current->euid); + if (user) { + nr->user_addr = user->call; + ax25_uid_put(user); + } else { if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) { dev_put(dev); release_sock(sk); return -EPERM; } - user = source; + nr->user_addr = *source; } - nr->user_addr = *user; nr->source_addr = *source; nr->device = dev; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 3fe7e562125a..5480caf8ccc2 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -626,7 +626,8 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct rose_sock *rose = rose_sk(sk); struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; struct net_device *dev; - ax25_address *user, *source; + ax25_address *source; + ax25_uid_assoc *user; int n; if (!sock_flag(sk, SOCK_ZAPPED)) @@ -651,14 +652,17 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) source = &addr->srose_call; - if ((user = ax25_findbyuid(current->euid)) == NULL) { + user = ax25_findbyuid(current->euid); + if 
(user) { + rose->source_call = user->call; + ax25_uid_put(user); + } else { if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; - user = source; + rose->source_call = *source; } rose->source_addr = addr->srose_addr; - rose->source_call = *user; rose->device = dev; rose->source_ndigis = addr->srose_ndigis; @@ -685,8 +689,8 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le struct rose_sock *rose = rose_sk(sk); struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; unsigned char cause, diagnostic; - ax25_address *user; struct net_device *dev; + ax25_uid_assoc *user; int n; if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { @@ -736,12 +740,14 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le if ((dev = rose_dev_first()) == NULL) return -ENETUNREACH; - if ((user = ax25_findbyuid(current->euid)) == NULL) + user = ax25_findbyuid(current->euid); + if (!user) return -EINVAL; memcpy(&rose->source_addr, dev->dev_addr, ROSE_ADDR_LEN); - rose->source_call = *user; + rose->source_call = user->call; rose->device = dev; + ax25_uid_put(user); rose_insert_socket(sk); /* Finish the bind */ } -- cgit
From d2287f844187158e5eddd0d5de8e95bd607abcb7 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 23 Aug 2005 10:12:04 -0700 Subject: [SCTP]: Add SENTINEL to SCTP MIB stats Add SNMP_MIB_SENTINEL to the definition of the sctp_snmp_list so that the output routine in proc terminates correctly. This was causing some problems running on ia64 systems. Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/proc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 98d49ec9b74b..b74f7772b576 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -57,6 +57,7 @@ static struct snmp_mib sctp_snmp_list[] = { SNMP_MIB_ITEM("SctpReasmUsrMsgs", SCTP_MIB_REASMUSRMSGS), SNMP_MIB_ITEM("SctpOutSCTPPacks", SCTP_MIB_OUTSCTPPACKS), SNMP_MIB_ITEM("SctpInSCTPPacks", SCTP_MIB_INSCTPPACKS), + SNMP_MIB_SENTINEL }; /* Return the current value of a particular entry in the mib by adding its -- cgit
From 0fbbeb1ba43bd04f0f1d4f161b7f72437a1c8a03 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 23 Aug 2005 10:12:44 -0700 Subject: [PKT_SCHED]: Fix missing qdisc_destroy() in qdisc_create_dflt() qdisc_create_dflt() fails to destroy the newly allocated default qdisc if the initialization fails, resulting in leaks of all kinds. The only caller in mainline which may trigger this bug is sch_tbf.c in tbf_create_dflt_qdisc(). Note: qdisc_create_dflt() doesn't fulfill the official locking requirements of qdisc_destroy(), but since the qdisc could never be seen by the outside world this doesn't matter, and it can stay as-is until the locking of pkt_sched is cleaned up. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 8edefd5d095d..0d066c965342 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -438,6 +438,7 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) if (!ops->init || ops->init(sch, NULL) == 0) return sch; + qdisc_destroy(sch); errout: return NULL; } -- cgit
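The one-line fix above is the classic construct-or-destroy rule: once allocation has succeeded, every failing exit path owns the object and must tear it down before returning. A small sketch of the pattern, with illustrative names rather than the pkt_sched API:

#include <stdlib.h>

struct obj {
	int (*init)(struct obj *);
	void *priv;
};

static void obj_destroy(struct obj *o)	/* plays the role of qdisc_destroy() */
{
	free(o->priv);
	free(o);
}

static struct obj *obj_create_dflt(int (*init)(struct obj *))
{
	struct obj *o = calloc(1, sizeof(*o));

	if (o == NULL)
		return NULL;
	o->init = init;
	if (o->init == NULL || o->init(o) == 0)
		return o;		/* success */
	obj_destroy(o);			/* the line the patch adds */
	return NULL;			/* init failed: nothing leaks */
}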
Miller" Date: Tue, 23 Aug 2005 10:13:06 -0700 Subject: [TCP]: Unconditionally clear TCP_NAGLE_PUSH in skb_entail(). Intention of this bit is to force pushing of the existing send queue when TCP_CORK or TCP_NODELAY state changes via setsockopt(). But it's easy to create a situation where the bit never clears. For example, if the send queue starts empty: 1) set TCP_NODELAY 2) clear TCP_NODELAY 3) set TCP_CORK 4) do small write() The current code will leave TCP_NAGLE_PUSH set after that sequence. Unconditionally clearing the bit when new data is added via skb_entail() solves the problem. Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ddb6ce4ecff2..69b1fcf70077 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -584,7 +584,7 @@ static inline void skb_entail(struct sock *sk, struct tcp_sock *tp, sk_charge_skb(sk, skb); if (!sk->sk_send_head) sk->sk_send_head = skb; - else if (tp->nonagle&TCP_NAGLE_PUSH) + if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; } -- cgit From d5d283751ef3c05b6766501a46800cbee84959d6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 23 Aug 2005 10:49:54 -0700 Subject: [TCP]: Document non-trivial locking path in tcp_v{4,6}_get_port(). This trips up a lot of folks reading this code. Put an unlikely() around the port-exhaustion test for good measure. Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 9 +++++++-- net/ipv6/tcp_ipv6.c | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5d91213d34c0..67c670886c1f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -242,9 +242,14 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) tcp_port_rover = rover; spin_unlock(&tcp_portalloc_lock); - /* Exhausted local port range during search? */ + /* Exhausted local port range during search? It is not + * possible for us to be holding one of the bind hash + * locks if this test triggers, because if 'remaining' + * drops to zero, we broke out of the do/while loop at + * the top level, not from the 'break;' statement. + */ ret = 1; - if (remaining <= 0) + if (unlikely(remaining <= 0)) goto fail; /* OK, here is the one we will use. HEAD is diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f6e288dc116e..ef29cfd936d3 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -158,9 +158,14 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) tcp_port_rover = rover; spin_unlock(&tcp_portalloc_lock); - /* Exhausted local port range during search? */ + /* Exhausted local port range during search? It is not + * possible for us to be holding one of the bind hash + * locks if this test triggers, because if 'remaining' + * drops to zero, we broke out of the do/while loop at + * the top level, not from the 'break;' statement. + */ ret = 1; - if (remaining <= 0) + if (unlikely(remaining <= 0)) goto fail; /* OK, here is the one we will use. */ -- cgit From dc16aaf29d64b8c5e0b88f49a4d541edf5b61e42 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 23 Aug 2005 10:50:09 -0700 Subject: [ROSE]: Fix missing unlocks in rose_route_frame() Noticed by Coverity checker. Signed-off-by: David S. 
From dc16aaf29d64b8c5e0b88f49a4d541edf5b61e42 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 23 Aug 2005 10:50:09 -0700 Subject: [ROSE]: Fix missing unlocks in rose_route_frame() Noticed by the Coverity checker. Signed-off-by: David S. Miller --- net/rose/rose_route.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index ff73ebb912b8..46b23217a353 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -994,8 +994,10 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) * 1. The frame isn't for us, * 2. It isn't "owned" by any existing route. */ - if (frametype != ROSE_CALL_REQUEST) /* XXX */ - return 0; + if (frametype != ROSE_CALL_REQUEST) { /* XXX */ + ret = 0; + goto out; + } len = (((skb->data[3] >> 4) & 0x0F) + 1) / 2; len += (((skb->data[3] >> 0) & 0x0F) + 1) / 2; -- cgit
From c1cc168442a943ed3997f6543db87c061987f9d7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 23 Aug 2005 14:55:32 -0700 Subject: [ROSE]: Fix typo in rose_route_frame() locking fix. Signed-off-by: David S. Miller --- net/rose/rose_route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 46b23217a353..25da6f699fd0 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -995,7 +995,7 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) * 2. It isn't "owned" by any existing route. */ if (frametype != ROSE_CALL_REQUEST) { /* XXX */ - ret = 0; + res = 0; goto out; } -- cgit
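Both of these small commits orbit the same idiom: inside a locked region, early return statements are replaced by a jump to a single unlock label so the lock is released on every path (the second commit above fixes the ret/res slip the first conversion introduced). A minimal self-contained sketch of the idiom using POSIX threads, with illustrative names that only mirror the shape of rose_route_frame(), not its logic:

#include <pthread.h>

static pthread_mutex_t route_lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns 1 if the frame was consumed, 0 if it was not for us. */
static int route_frame(int frametype, int call_request)
{
	int res;

	pthread_mutex_lock(&route_lock);
	if (frametype != call_request) {
		res = 0;	/* was "return 0;", which leaked the lock */
		goto out;
	}
	/* ... normal routing work under the lock ... */
	res = 1;
out:
	pthread_mutex_unlock(&route_lock);
	return res;
}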
From 06c7427021f1cc83703f14659d8405ca773ba1ef Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 23 Aug 2005 22:06:09 -0700 Subject: [FIB_TRIE]: Don't ignore negative results from fib_semantic_match When a semantic match occurs, either success, not found, or an error (for matching unreachable routes/blackholes) is returned. fib_trie ignores the errors and looks for a different matching route. Treat results other than "no match" as success and end the lookup. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index a701405fab0b..45efd5f4741b 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1333,9 +1333,9 @@ err:; } static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp, - struct fib_result *res, int *err) + struct fib_result *res) { - int i; + int err, i; t_key mask; struct leaf_info *li; struct hlist_head *hhead = &l->list; @@ -1348,18 +1348,18 @@ static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *pl if (l->key != (key & mask)) continue; - if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) { + if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) { *plen = i; #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats.semantic_match_passed++; #endif - return 1; + return err; } #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats.semantic_match_miss++; #endif } - return 0; + return 1; } static int @@ -1386,7 +1386,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result /* Just a leaf? */ if (IS_LEAF(n)) { - if (check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret)) + if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) goto found; goto failed; } @@ -1508,7 +1508,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result continue; } if (IS_LEAF(n)) { - if (check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret)) + if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) goto found; } backtrace: -- cgit
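The key to the fib_trie change is the three-valued contract check_leaf() now shares with fib_semantic_match(): zero means a usable match, negative means a definitive error such as an unreachable route, and positive means "nothing here, keep walking". Before the patch, only zero stopped the walk, so negative answers, which are equally final, were discarded. A toy model of the convention follows, with invented names and a fake error code; nothing here is the fib_trie API:

#define ERR_UNREACHABLE 113	/* fake stand-in for a real errno value */

/* Toy rule: a leaf matches when |leaf| == want; a negative leaf models
 * an unreachable/blackhole route. Returns 0 = match, <0 = definitive
 * error, >0 = no match here. */
static int semantic_match(int leaf, int want)
{
	if (leaf != want && -leaf != want)
		return 1;
	if (leaf < 0)
		return -ERR_UNREACHABLE;
	return 0;
}

static int lookup(const int *leaves, int n, int want, int *result)
{
	for (int i = 0; i < n; i++) {
		int err = semantic_match(leaves[i], want);

		/* The fix in one line: any err <= 0 is a final answer.
		 * The old code only stopped on err == 0, walking past
		 * negative (definitive) results. */
		if (err <= 0) {
			*result = leaves[i];
			return err;
		}
	}
	return 1;	/* table exhausted: genuinely no match */
}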