summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/fib_trie.c68
-rw-r--r--net/ipv4/ping.c4
-rw-r--r--net/ipv4/tcp.c4
-rw-r--r--net/ipv4/tcp_dctcp.c9
-rw-r--r--net/ipv4/tcp_input.c22
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/tcp_output.c91
-rw-r--r--net/ipv4/tcp_timer.c4
8 files changed, 121 insertions, 83 deletions
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 73a62700b00a..1b0e7d1f5217 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -808,6 +808,13 @@ static unsigned char update_suffix(struct key_vector *tn)
{
unsigned char slen = tn->pos;
unsigned long stride, i;
+ unsigned char slen_max;
+
+ /* only vector 0 can have a suffix length greater than or equal to
+ * tn->pos + tn->bits, the second highest node will have a suffix
+ * length at most of tn->pos + tn->bits - 1
+ */
+ slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen);
/* search though the list of children looking for nodes that might
* have a suffix greater than the one we currently have. This is
@@ -825,12 +832,8 @@ static unsigned char update_suffix(struct key_vector *tn)
slen = n->slen;
i &= ~(stride - 1);
- /* if slen covers all but the last bit we can stop here
- * there will be nothing longer than that since only node
- * 0 and 1 << (bits - 1) could have that as their suffix
- * length.
- */
- if ((slen + 1) >= (tn->pos + tn->bits))
+ /* stop searching if we have hit the maximum possible value */
+ if (slen >= slen_max)
break;
}
@@ -1002,39 +1005,27 @@ static struct key_vector *resize(struct trie *t, struct key_vector *tn)
return collapse(t, tn);
/* update parent in case halve failed */
- tp = node_parent(tn);
-
- /* Return if at least one deflate was run */
- if (max_work != MAX_WORK)
- return tp;
-
- /* push the suffix length to the parent node */
- if (tn->slen > tn->pos) {
- unsigned char slen = update_suffix(tn);
-
- if (slen > tp->slen)
- tp->slen = slen;
- }
-
- return tp;
+ return node_parent(tn);
}
-static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l)
+static void node_pull_suffix(struct key_vector *tn, unsigned char slen)
{
- while ((tp->slen > tp->pos) && (tp->slen > l->slen)) {
- if (update_suffix(tp) > l->slen)
+ unsigned char node_slen = tn->slen;
+
+ while ((node_slen > tn->pos) && (node_slen > slen)) {
+ slen = update_suffix(tn);
+ if (node_slen == slen)
break;
- tp = node_parent(tp);
+
+ tn = node_parent(tn);
+ node_slen = tn->slen;
}
}
-static void leaf_push_suffix(struct key_vector *tn, struct key_vector *l)
+static void node_push_suffix(struct key_vector *tn, unsigned char slen)
{
- /* if this is a new leaf then tn will be NULL and we can sort
- * out parent suffix lengths as a part of trie_rebalance
- */
- while (tn->slen < l->slen) {
- tn->slen = l->slen;
+ while (tn->slen < slen) {
+ tn->slen = slen;
tn = node_parent(tn);
}
}
@@ -1155,6 +1146,7 @@ static int fib_insert_node(struct trie *t, struct key_vector *tp,
}
/* Case 3: n is NULL, and will just insert a new leaf */
+ node_push_suffix(tp, new->fa_slen);
NODE_INIT_PARENT(l, tp);
put_child_root(tp, key, l);
trie_rebalance(t, tp);
@@ -1196,7 +1188,7 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
/* if we added to the tail node then we need to update slen */
if (l->slen < new->fa_slen) {
l->slen = new->fa_slen;
- leaf_push_suffix(tp, l);
+ node_push_suffix(tp, new->fa_slen);
}
return 0;
@@ -1588,6 +1580,8 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
* out parent suffix lengths as a part of trie_rebalance
*/
if (hlist_empty(&l->leaf)) {
+ if (tp->slen == l->slen)
+ node_pull_suffix(tp, tp->pos);
put_child_root(tp, l->key, NULL);
node_free(l);
trie_rebalance(t, tp);
@@ -1600,7 +1594,7 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
/* update the trie with the latest suffix length */
l->slen = fa->fa_slen;
- leaf_pull_suffix(tp, l);
+ node_pull_suffix(tp, fa->fa_slen);
}
/* Caller must hold RTNL. */
@@ -1872,6 +1866,10 @@ void fib_table_flush_external(struct fib_table *tb)
if (IS_TRIE(pn))
break;
+ /* update the suffix to address pulled leaves */
+ if (pn->slen > pn->pos)
+ update_suffix(pn);
+
/* resize completed node */
pn = resize(t, pn);
cindex = get_index(pkey, pn);
@@ -1938,6 +1936,10 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
if (IS_TRIE(pn))
break;
+ /* update the suffix to address pulled leaves */
+ if (pn->slen > pn->pos)
+ update_suffix(pn);
+
/* resize completed node */
pn = resize(t, pn);
cindex = get_index(pkey, pn);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index d11129f1178d..5b2635e69a92 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -657,6 +657,10 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
if (len > 0xFFFF)
return -EMSGSIZE;
+ /* Must have at least a full ICMP header. */
+ if (len < icmph_len)
+ return -EINVAL;
+
/*
* Check the flags.
*/
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1149b48700a1..1ef3165114ba 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -663,9 +663,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
if (tcp_should_autocork(sk, skb, size_goal)) {
/* avoid atomic op if TSQ_THROTTLED bit is already set */
- if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
+ if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
- set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
}
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED.
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index bde22ebb92a8..5f5e5936760e 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -188,8 +188,8 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
static void dctcp_update_alpha(struct sock *sk, u32 flags)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
struct dctcp *ca = inet_csk_ca(sk);
- struct tcp_sock *tp = tcp_sk(sk);
u32 acked_bytes = tp->snd_una - ca->prior_snd_una;
/* If ack did not advance snd_una, count dupack as MSS size.
@@ -229,13 +229,6 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags)
WRITE_ONCE(ca->dctcp_alpha, alpha);
dctcp_reset(tp, ca);
}
-
- if (flags & CA_ACK_ECE) {
- unsigned int cwnd = dctcp_ssthresh(sk);
-
- if (cwnd != tp->snd_cwnd)
- tp->snd_cwnd = cwnd;
- }
}
static void dctcp_state(struct sock *sk, u8 new_state)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fe668c1b9ced..6c790754ae3e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -129,6 +129,23 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define REXMIT_LOST 1 /* retransmit packets marked lost */
#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
+static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb)
+{
+ static bool __once __read_mostly;
+
+ if (!__once) {
+ struct net_device *dev;
+
+ __once = true;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
+ pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
+ dev ? dev->name : "Unknown driver");
+ rcu_read_unlock();
+ }
+}
+
/* Adapt the MSS value used to make delayed ack decision to the
* real world.
*/
@@ -145,7 +162,10 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
*/
len = skb_shinfo(skb)->gso_size ? : skb->len;
if (len >= icsk->icsk_ack.rcv_mss) {
- icsk->icsk_ack.rcv_mss = len;
+ icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
+ tcp_sk(sk)->advmss);
+ if (unlikely(icsk->icsk_ack.rcv_mss != len))
+ tcp_gro_dev_warn(sk, skb);
} else {
/* Otherwise, we make more careful check taking into account,
* that SACKs block is variable.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b50f05905ced..30d81f533ada 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -443,7 +443,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (!sock_owned_by_user(sk)) {
tcp_v4_mtu_reduced(sk);
} else {
- if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
+ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
goto out;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c7adcb57654e..b45101f3d2bd 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -769,25 +769,26 @@ static void tcp_tasklet_func(unsigned long data)
list_del(&tp->tsq_node);
sk = (struct sock *)tp;
- bh_lock_sock(sk);
-
- if (!sock_owned_by_user(sk)) {
- tcp_tsq_handler(sk);
- } else {
- /* defer the work to tcp_release_cb() */
- set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+ clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
+
+ if (!sk->sk_lock.owned &&
+ test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
+ tcp_tsq_handler(sk);
+ }
+ bh_unlock_sock(sk);
}
- bh_unlock_sock(sk);
- clear_bit(TSQ_QUEUED, &tp->tsq_flags);
sk_free(sk);
}
}
-#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
- (1UL << TCP_WRITE_TIMER_DEFERRED) | \
- (1UL << TCP_DELACK_TIMER_DEFERRED) | \
- (1UL << TCP_MTU_REDUCED_DEFERRED))
+#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
+ TCPF_WRITE_TIMER_DEFERRED | \
+ TCPF_DELACK_TIMER_DEFERRED | \
+ TCPF_MTU_REDUCED_DEFERRED)
/**
* tcp_release_cb - tcp release_sock() callback
* @sk: socket
@@ -797,18 +798,17 @@ static void tcp_tasklet_func(unsigned long data)
*/
void tcp_release_cb(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
unsigned long flags, nflags;
/* perform an atomic operation only if at least one flag is set */
do {
- flags = tp->tsq_flags;
+ flags = sk->sk_tsq_flags;
if (!(flags & TCP_DEFERRED_ALL))
return;
nflags = flags & ~TCP_DEFERRED_ALL;
- } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+ } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
- if (flags & (1UL << TCP_TSQ_DEFERRED))
+ if (flags & TCPF_TSQ_DEFERRED)
tcp_tsq_handler(sk);
/* Here begins the tricky part :
@@ -822,15 +822,15 @@ void tcp_release_cb(struct sock *sk)
*/
sock_release_ownership(sk);
- if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
+ if (flags & TCPF_WRITE_TIMER_DEFERRED) {
tcp_write_timer_handler(sk);
__sock_put(sk);
}
- if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
+ if (flags & TCPF_DELACK_TIMER_DEFERRED) {
tcp_delack_timer_handler(sk);
__sock_put(sk);
}
- if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
+ if (flags & TCPF_MTU_REDUCED_DEFERRED) {
inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
__sock_put(sk);
}
@@ -860,6 +860,7 @@ void tcp_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long flags, nval, oval;
int wmem;
/* Keep one reference on sk_wmem_alloc.
@@ -877,16 +878,25 @@ void tcp_wfree(struct sk_buff *skb)
if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
goto out;
- if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
- !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
- unsigned long flags;
+ for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
struct tsq_tasklet *tsq;
+ bool empty;
+
+ if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
+ goto out;
+
+ nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+ nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
+ if (nval != oval)
+ continue;
/* queue this socket to tasklet queue */
local_irq_save(flags);
tsq = this_cpu_ptr(&tsq_tasklet);
+ empty = list_empty(&tsq->head);
list_add(&tp->tsq_node, &tsq->head);
- tasklet_schedule(&tsq->tasklet);
+ if (empty)
+ tasklet_schedule(&tsq->tasklet);
local_irq_restore(flags);
return;
}
@@ -1922,26 +1932,26 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk)
*/
static int tcp_mtu_probe(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb, *nskb, *next;
struct net *net = sock_net(sk);
- int len;
int probe_size;
int size_needed;
- int copy;
+ int copy, len;
int mss_now;
int interval;
/* Not currently probing/verifying,
* not in recovery,
* have enough cwnd, and
- * not SACKing (the variable headers throw things off) */
- if (!icsk->icsk_mtup.enabled ||
- icsk->icsk_mtup.probe_size ||
- inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
- tp->snd_cwnd < 11 ||
- tp->rx_opt.num_sacks || tp->rx_opt.dsack)
+ * not SACKing (the variable headers throw things off)
+ */
+ if (likely(!icsk->icsk_mtup.enabled ||
+ icsk->icsk_mtup.probe_size ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+ tp->snd_cwnd < 11 ||
+ tp->rx_opt.num_sacks || tp->rx_opt.dsack))
return -1;
/* Use binary search for probe_size between tcp_mss_base,
@@ -2081,7 +2091,16 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit <<= factor;
if (atomic_read(&sk->sk_wmem_alloc) > limit) {
- set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
+ /* Always send the 1st or 2nd skb in write queue.
+ * No need to wait for TX completion to call us back,
+ * after softirq/tasklet schedule.
+ * This helps when TX completions are delayed too much.
+ */
+ if (skb == sk->sk_write_queue.next ||
+ skb->prev == sk->sk_write_queue.next)
+ return false;
+
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED, so we must
* test again the condition.
@@ -2222,6 +2241,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
+ if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+ clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
if (tcp_small_queue_check(sk, skb, 0))
break;
@@ -3524,8 +3545,6 @@ void tcp_send_ack(struct sock *sk)
/* We do not want pure acks influencing TCP Small Queues or fq/pacing
* too much.
* SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
- * We also avoid tcp_wfree() overhead (cache line miss accessing
- * tp->tsq_flags) by using regular sock_wfree()
*/
skb_set_tcp_pure_ack(buff);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3ea1cf804748..3705075f42c3 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -310,7 +310,7 @@ static void tcp_delack_timer(unsigned long data)
inet_csk(sk)->icsk_ack.blocked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
/* deleguate our work to tcp_release_cb() */
- if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
@@ -592,7 +592,7 @@ static void tcp_write_timer(unsigned long data)
tcp_write_timer_handler(sk);
} else {
/* delegate our work to tcp_release_cb() */
- if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);