diff options
Diffstat (limited to 'net/unix/garbage.c')
| -rw-r--r-- | net/unix/garbage.c | 228 | 
1 files changed, 151 insertions, 77 deletions
| diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 2405f0f9af31..0104be9d4704 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -81,12 +81,80 @@  #include <net/scm.h>  #include <net/tcp_states.h> -#include "scm.h" +struct unix_sock *unix_get_socket(struct file *filp) +{ +	struct inode *inode = file_inode(filp); + +	/* Socket ? */ +	if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { +		struct socket *sock = SOCKET_I(inode); +		const struct proto_ops *ops; +		struct sock *sk = sock->sk; + +		ops = READ_ONCE(sock->ops); -/* Internal data structures and random procedures: */ +		/* PF_UNIX ? */ +		if (sk && ops && ops->family == PF_UNIX) +			return unix_sk(sk); +	} + +	return NULL; +} +DEFINE_SPINLOCK(unix_gc_lock); +unsigned int unix_tot_inflight;  static LIST_HEAD(gc_candidates); -static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); +static LIST_HEAD(gc_inflight_list); + +/* Keep the number of times in flight count for the file + * descriptor if it is for an AF_UNIX socket. + */ +void unix_inflight(struct user_struct *user, struct file *filp) +{ +	struct unix_sock *u = unix_get_socket(filp); + +	spin_lock(&unix_gc_lock); + +	if (u) { +		if (!u->inflight) { +			WARN_ON_ONCE(!list_empty(&u->link)); +			list_add_tail(&u->link, &gc_inflight_list); +		} else { +			WARN_ON_ONCE(list_empty(&u->link)); +		} +		u->inflight++; + +		/* Paired with READ_ONCE() in wait_for_unix_gc() */ +		WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); +	} + +	WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); + +	spin_unlock(&unix_gc_lock); +} + +void unix_notinflight(struct user_struct *user, struct file *filp) +{ +	struct unix_sock *u = unix_get_socket(filp); + +	spin_lock(&unix_gc_lock); + +	if (u) { +		WARN_ON_ONCE(!u->inflight); +		WARN_ON_ONCE(list_empty(&u->link)); + +		u->inflight--; +		if (!u->inflight) +			list_del_init(&u->link); + +		/* Paired with READ_ONCE() in wait_for_unix_gc() */ +		WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); +	} + +	WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); + +	spin_unlock(&unix_gc_lock); +}  static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),  			  struct sk_buff_head *hitlist) @@ -105,20 +173,15 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),  			while (nfd--) {  				/* Get the socket the fd matches if it indeed does so */ -				struct sock *sk = unix_get_socket(*fp++); +				struct unix_sock *u = unix_get_socket(*fp++); -				if (sk) { -					struct unix_sock *u = unix_sk(sk); +				/* Ignore non-candidates, they could have been added +				 * to the queues after starting the garbage collection +				 */ +				if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { +					hit = true; -					/* Ignore non-candidates, they could -					 * have been added to the queues after -					 * starting the garbage collection -					 */ -					if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { -						hit = true; - -						func(u); -					} +					func(u);  				}  			}  			if (hit && hitlist != NULL) { @@ -151,7 +214,7 @@ static void scan_children(struct sock *x, void (*func)(struct unix_sock *),  			/* An embryo cannot be in-flight, so it's safe  			 * to use the list link.  			 */ -			BUG_ON(!list_empty(&u->link)); +			WARN_ON_ONCE(!list_empty(&u->link));  			list_add_tail(&u->link, &embryos);  		}  		spin_unlock(&x->sk_receive_queue.lock); @@ -166,17 +229,18 @@ static void scan_children(struct sock *x, void (*func)(struct unix_sock *),  static void dec_inflight(struct unix_sock *usk)  { -	atomic_long_dec(&usk->inflight); +	usk->inflight--;  }  static void inc_inflight(struct unix_sock *usk)  { -	atomic_long_inc(&usk->inflight); +	usk->inflight++;  }  static void inc_inflight_move_tail(struct unix_sock *u)  { -	atomic_long_inc(&u->inflight); +	u->inflight++; +  	/* If this still might be part of a cycle, move it to the end  	 * of the list, so that it's checked even if it was already  	 * passed over @@ -186,40 +250,16 @@ static void inc_inflight_move_tail(struct unix_sock *u)  }  static bool gc_in_progress; -#define UNIX_INFLIGHT_TRIGGER_GC 16000 -void wait_for_unix_gc(void) +static void __unix_gc(struct work_struct *work)  { -	/* If number of inflight sockets is insane, -	 * force a garbage collect right now. -	 * Paired with the WRITE_ONCE() in unix_inflight(), -	 * unix_notinflight() and gc_in_progress(). -	 */ -	if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && -	    !READ_ONCE(gc_in_progress)) -		unix_gc(); -	wait_event(unix_gc_wait, gc_in_progress == false); -} - -/* The external entry point: unix_gc() */ -void unix_gc(void) -{ -	struct sk_buff *next_skb, *skb; -	struct unix_sock *u; -	struct unix_sock *next;  	struct sk_buff_head hitlist; -	struct list_head cursor; +	struct unix_sock *u, *next;  	LIST_HEAD(not_cycle_list); +	struct list_head cursor;  	spin_lock(&unix_gc_lock); -	/* Avoid a recursive GC. */ -	if (gc_in_progress) -		goto out; - -	/* Paired with READ_ONCE() in wait_for_unix_gc(). */ -	WRITE_ONCE(gc_in_progress, true); -  	/* First, select candidates for garbage collection.  Only  	 * in-flight sockets are considered, and from those only ones  	 * which don't have any external reference. @@ -234,20 +274,34 @@ void unix_gc(void)  	 * receive queues.  Other, non candidate sockets _can_ be  	 * added to queue, so we must make sure only to touch  	 * candidates. +	 * +	 * Embryos, though never candidates themselves, affect which +	 * candidates are reachable by the garbage collector.  Before +	 * being added to a listener's queue, an embryo may already +	 * receive data carrying SCM_RIGHTS, potentially making the +	 * passed socket a candidate that is not yet reachable by the +	 * collector.  It becomes reachable once the embryo is +	 * enqueued.  Therefore, we must ensure that no SCM-laden +	 * embryo appears in a (candidate) listener's queue between +	 * consecutive scan_children() calls.  	 */  	list_for_each_entry_safe(u, next, &gc_inflight_list, link) { +		struct sock *sk = &u->sk;  		long total_refs; -		long inflight_refs; -		total_refs = file_count(u->sk.sk_socket->file); -		inflight_refs = atomic_long_read(&u->inflight); +		total_refs = file_count(sk->sk_socket->file); -		BUG_ON(inflight_refs < 1); -		BUG_ON(total_refs < inflight_refs); -		if (total_refs == inflight_refs) { +		WARN_ON_ONCE(!u->inflight); +		WARN_ON_ONCE(total_refs < u->inflight); +		if (total_refs == u->inflight) {  			list_move_tail(&u->link, &gc_candidates);  			__set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);  			__set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); + +			if (sk->sk_state == TCP_LISTEN) { +				unix_state_lock_nested(sk, U_LOCK_GC_LISTENER); +				unix_state_unlock(sk); +			}  		}  	} @@ -271,7 +325,7 @@ void unix_gc(void)  		/* Move cursor to after the current position. */  		list_move(&cursor, &u->link); -		if (atomic_long_read(&u->inflight) > 0) { +		if (u->inflight) {  			list_move_tail(&u->link, ¬_cycle_list);  			__clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);  			scan_children(&u->sk, inc_inflight_move_tail, NULL); @@ -284,9 +338,17 @@ void unix_gc(void)  	 * which are creating the cycle(s).  	 */  	skb_queue_head_init(&hitlist); -	list_for_each_entry(u, &gc_candidates, link) +	list_for_each_entry(u, &gc_candidates, link) {  		scan_children(&u->sk, inc_inflight, &hitlist); +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) +		if (u->oob_skb) { +			kfree_skb(u->oob_skb); +			u->oob_skb = NULL; +		} +#endif +	} +  	/* not_cycle_list contains those sockets which do not make up a  	 * cycle.  Restore these to the inflight list.  	 */ @@ -298,38 +360,50 @@ void unix_gc(void)  	spin_unlock(&unix_gc_lock); -	/* We need io_uring to clean its registered files, ignore all io_uring -	 * originated skbs. It's fine as io_uring doesn't keep references to -	 * other io_uring instances and so killing all other files in the cycle -	 * will put all io_uring references forcing it to go through normal -	 * release.path eventually putting registered files. -	 */ -	skb_queue_walk_safe(&hitlist, skb, next_skb) { -		if (skb->destructor == io_uring_destruct_scm) { -			__skb_unlink(skb, &hitlist); -			skb_queue_tail(&skb->sk->sk_receive_queue, skb); -		} -	} -  	/* Here we are. Hitlist is filled. Die. */  	__skb_queue_purge(&hitlist);  	spin_lock(&unix_gc_lock); -	/* There could be io_uring registered files, just push them back to -	 * the inflight list -	 */ -	list_for_each_entry_safe(u, next, &gc_candidates, link) -		list_move_tail(&u->link, &gc_inflight_list); -  	/* All candidates should have been detached by now. */ -	BUG_ON(!list_empty(&gc_candidates)); +	WARN_ON_ONCE(!list_empty(&gc_candidates));  	/* Paired with READ_ONCE() in wait_for_unix_gc(). */  	WRITE_ONCE(gc_in_progress, false); -	wake_up(&unix_gc_wait); - - out:  	spin_unlock(&unix_gc_lock);  } + +static DECLARE_WORK(unix_gc_work, __unix_gc); + +void unix_gc(void) +{ +	WRITE_ONCE(gc_in_progress, true); +	queue_work(system_unbound_wq, &unix_gc_work); +} + +#define UNIX_INFLIGHT_TRIGGER_GC 16000 +#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) + +void wait_for_unix_gc(struct scm_fp_list *fpl) +{ +	/* If number of inflight sockets is insane, +	 * force a garbage collect right now. +	 * +	 * Paired with the WRITE_ONCE() in unix_inflight(), +	 * unix_notinflight(), and __unix_gc(). +	 */ +	if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && +	    !READ_ONCE(gc_in_progress)) +		unix_gc(); + +	/* Penalise users who want to send AF_UNIX sockets +	 * but whose sockets have not been received yet. +	 */ +	if (!fpl || !fpl->count_unix || +	    READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) +		return; + +	if (READ_ONCE(gc_in_progress)) +		flush_work(&unix_gc_work); +} | 
