From 8b0a6b461e2ccc95363e0547aa4f43ba2e02b096 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Fri, 3 Feb 2012 11:09:23 -0500 Subject: RDS: make socket bind/release locking scheme simple and more efficient RDS bind and release locking scheme is very inefficient. It uses RCU for maintaining the bind hash-table which is great but it also needs to hold spinlock for [add/remove]_bound(). So overall usecase, the hash-table concurrent speedup doesn't pay off. In fact blocking nature of synchronize_rcu() makes the RDS socket shutdown too slow which hurts RDS performance since connection shutdown and re-connect happens quite often to maintain the RC part of the protocol. So we make the locking scheme simpler and more efficient by replacing spin_locks with reader/writer locks and getting rid off rcu for bind hash-table. In subsequent patch, we also covert the global lock with per-bucket lock to reduce the global lock contention. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar --- net/rds/bind.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) (limited to 'net/rds/bind.c') diff --git a/net/rds/bind.c b/net/rds/bind.c index dd666fb9b4e1..01989e2104bd 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -40,7 +40,7 @@ #define BIND_HASH_SIZE 1024 static struct hlist_head bind_hash_table[BIND_HASH_SIZE]; -static DEFINE_SPINLOCK(rds_bind_lock); +static DEFINE_RWLOCK(rds_bind_lock); static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) { @@ -48,6 +48,7 @@ static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) (BIND_HASH_SIZE - 1)); } +/* must hold either read or write lock (write lock for insert != NULL) */ static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, struct rds_sock *insert) { @@ -56,30 +57,24 @@ static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, u64 cmp; u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); - rcu_read_lock(); - hlist_for_each_entry_rcu(rs, head, rs_bound_node) { + hlist_for_each_entry(rs, head, rs_bound_node) { cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | be16_to_cpu(rs->rs_bound_port); - if (cmp == needle) { - rcu_read_unlock(); + if (cmp == needle) return rs; - } } - rcu_read_unlock(); if (insert) { /* * make sure our addr and port are set before - * we are added to the list, other people - * in rcu will find us as soon as the - * hlist_add_head_rcu is done + * we are added to the list. */ insert->rs_bound_addr = addr; insert->rs_bound_port = port; rds_sock_addref(insert); - hlist_add_head_rcu(&insert->rs_bound_node, head); + hlist_add_head(&insert->rs_bound_node, head); } return NULL; } @@ -93,8 +88,11 @@ static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, struct rds_sock *rds_find_bound(__be32 addr, __be16 port) { struct rds_sock *rs; + unsigned long flags; + read_lock_irqsave(&rds_bind_lock, flags); rs = rds_bind_lookup(addr, port, NULL); + read_unlock_irqrestore(&rds_bind_lock, flags); if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) rds_sock_addref(rs); @@ -103,6 +101,7 @@ struct rds_sock *rds_find_bound(__be32 addr, __be16 port) rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, ntohs(port)); + return rs; } @@ -121,7 +120,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) last = rover - 1; } - spin_lock_irqsave(&rds_bind_lock, flags); + write_lock_irqsave(&rds_bind_lock, flags); do { if (rover == 0) @@ -135,7 +134,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) } } while (rover++ != last); - spin_unlock_irqrestore(&rds_bind_lock, flags); + write_unlock_irqrestore(&rds_bind_lock, flags); return ret; } @@ -144,19 +143,19 @@ void rds_remove_bound(struct rds_sock *rs) { unsigned long flags; - spin_lock_irqsave(&rds_bind_lock, flags); + write_lock_irqsave(&rds_bind_lock, flags); if (rs->rs_bound_addr) { rdsdebug("rs %p unbinding from %pI4:%d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port)); - hlist_del_init_rcu(&rs->rs_bound_node); + hlist_del_init(&rs->rs_bound_node); rds_sock_put(rs); rs->rs_bound_addr = 0; } - spin_unlock_irqrestore(&rds_bind_lock, flags); + write_unlock_irqrestore(&rds_bind_lock, flags); } int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) @@ -200,9 +199,5 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) out: release_sock(sk); - - /* we might have called rds_remove_bound on error */ - if (ret) - synchronize_rcu(); return ret; } -- cgit From 28126959882d3ec4745f2ec800f3a1d74368b2fe Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Fri, 3 Feb 2012 11:08:50 -0500 Subject: RDS: fix rds_sock reference bug while doing bind One need to take rds socket reference while using it and release it once done with it. rds_add_bind() code path does not do that so lets fix it. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar --- net/rds/bind.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net/rds/bind.c') diff --git a/net/rds/bind.c b/net/rds/bind.c index 01989e2104bd..166c60568d5e 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -61,8 +61,10 @@ static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | be16_to_cpu(rs->rs_bound_port); - if (cmp == needle) + if (cmp == needle) { + rds_sock_addref(rs); return rs; + } } if (insert) { @@ -94,10 +96,10 @@ struct rds_sock *rds_find_bound(__be32 addr, __be16 port) rs = rds_bind_lookup(addr, port, NULL); read_unlock_irqrestore(&rds_bind_lock, flags); - if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) - rds_sock_addref(rs); - else + if (rs && sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) { + rds_sock_put(rs); rs = NULL; + } rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, ntohs(port)); @@ -123,14 +125,18 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) write_lock_irqsave(&rds_bind_lock, flags); do { + struct rds_sock *rrs; if (rover == 0) rover++; - if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) { + rrs = rds_bind_lookup(addr, cpu_to_be16(rover), rs); + if (!rrs) { *port = rs->rs_bound_port; ret = 0; rdsdebug("rs %p binding to %pI4:%d\n", rs, &addr, (int)ntohs(*port)); break; + } else { + rds_sock_put(rrs); } } while (rover++ != last); -- cgit From 9b9acde7e887e057568cd077d9c3377d2cb9aa5b Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 11 Feb 2014 19:34:25 -0800 Subject: RDS: Use per-bucket rw lock for bind hash-table One global lock protecting hash-tables with 1024 buckets isn't efficient and it shows up in a massive systems with truck loads of RDS sockets serving multiple databases. The perf data clearly highlights the contention on the rw lock in these massive workloads. When the contention gets worse, the code gets into a state where it decides to back off on the lock. So while it has disabled interrupts, it sits and backs off on this lock get. This causes the system to become sluggish and eventually all sorts of bad things happen. The simple fix is to move the lock into the hash bucket and use per-bucket lock to improve the scalability. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar --- net/rds/bind.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'net/rds/bind.c') diff --git a/net/rds/bind.c b/net/rds/bind.c index 166c60568d5e..bc6b93ecedb5 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -38,22 +38,27 @@ #include #include "rds.h" +struct bind_bucket { + rwlock_t lock; + struct hlist_head head; +}; + #define BIND_HASH_SIZE 1024 -static struct hlist_head bind_hash_table[BIND_HASH_SIZE]; -static DEFINE_RWLOCK(rds_bind_lock); +static struct bind_bucket bind_hash_table[BIND_HASH_SIZE]; -static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) +static struct bind_bucket *hash_to_bucket(__be32 addr, __be16 port) { return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) & (BIND_HASH_SIZE - 1)); } /* must hold either read or write lock (write lock for insert != NULL) */ -static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, +static struct rds_sock *rds_bind_lookup(struct bind_bucket *bucket, + __be32 addr, __be16 port, struct rds_sock *insert) { struct rds_sock *rs; - struct hlist_head *head = hash_to_bucket(addr, port); + struct hlist_head *head = &bucket->head; u64 cmp; u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); @@ -91,10 +96,11 @@ struct rds_sock *rds_find_bound(__be32 addr, __be16 port) { struct rds_sock *rs; unsigned long flags; + struct bind_bucket *bucket = hash_to_bucket(addr, port); - read_lock_irqsave(&rds_bind_lock, flags); - rs = rds_bind_lookup(addr, port, NULL); - read_unlock_irqrestore(&rds_bind_lock, flags); + read_lock_irqsave(&bucket->lock, flags); + rs = rds_bind_lookup(bucket, addr, port, NULL); + read_unlock_irqrestore(&bucket->lock, flags); if (rs && sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) { rds_sock_put(rs); @@ -113,6 +119,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) unsigned long flags; int ret = -EADDRINUSE; u16 rover, last; + struct bind_bucket *bucket; if (*port != 0) { rover = be16_to_cpu(*port); @@ -122,13 +129,15 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) last = rover - 1; } - write_lock_irqsave(&rds_bind_lock, flags); - do { struct rds_sock *rrs; if (rover == 0) rover++; - rrs = rds_bind_lookup(addr, cpu_to_be16(rover), rs); + + bucket = hash_to_bucket(addr, cpu_to_be16(rover)); + write_lock_irqsave(&bucket->lock, flags); + rrs = rds_bind_lookup(bucket, addr, cpu_to_be16(rover), rs); + write_unlock_irqrestore(&bucket->lock, flags); if (!rrs) { *port = rs->rs_bound_port; ret = 0; @@ -140,16 +149,16 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) } } while (rover++ != last); - write_unlock_irqrestore(&rds_bind_lock, flags); - return ret; } void rds_remove_bound(struct rds_sock *rs) { unsigned long flags; + struct bind_bucket *bucket = + hash_to_bucket(rs->rs_bound_addr, rs->rs_bound_port); - write_lock_irqsave(&rds_bind_lock, flags); + write_lock_irqsave(&bucket->lock, flags); if (rs->rs_bound_addr) { rdsdebug("rs %p unbinding from %pI4:%d\n", @@ -161,7 +170,7 @@ void rds_remove_bound(struct rds_sock *rs) rs->rs_bound_addr = 0; } - write_unlock_irqrestore(&rds_bind_lock, flags); + write_unlock_irqrestore(&bucket->lock, flags); } int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) @@ -207,3 +216,11 @@ out: release_sock(sk); return ret; } + +void rds_bind_lock_init(void) +{ + int i; + + for (i = 0; i < BIND_HASH_SIZE; i++) + rwlock_init(&bind_hash_table[i].lock); +} -- cgit