diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-03-24 09:52:37 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-03-24 09:52:37 -0700 |
commit | 71ee2fde57c707ac8f221321f3e951288f00f04b (patch) | |
tree | c45ae7050db9b276132e3ca4834c23ff963b6a25 | |
parent | fd101da676362aaa051b4f5d8a941bd308603041 (diff) | |
parent | 3732d8f16531ddc591622bc64ce4d4c160c34bb4 (diff) |
Merge tag 'vfs-6.15-rc1.pipe' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs pipe updates from Christian Brauner:
- Introduce struct file_operations pipeanon_fops
- Don't update {a,c,m}time for anonymous pipes to avoid the performance
costs associated with it
- Change pipe_write() to never add a zero-sized buffer
- Limit the slots in pipe_resize_ring()
- Use pipe_buf() to retrieve the pipe buffer everywhere
- Drop an always true check in anon_pipe_write()
- Cache 2 pages instead of 1
- Avoid spurious calls to prepare_to_wait_event() in ___wait_event()
* tag 'vfs-6.15-rc1.pipe' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
fs/splice: Use pipe_buf() helper to retrieve pipe buffer
fs/pipe: Use pipe_buf() helper to retrieve pipe buffer
kernel/watch_queue: Use pipe_buf() to retrieve the pipe buffer
fs/pipe: Limit the slots in pipe_resize_ring()
wait: avoid spurious calls to prepare_to_wait_event() in ___wait_event()
pipe: cache 2 pages instead of 1
pipe: drop an always true check in anon_pipe_write()
pipe: change pipe_write() to never add a zero-sized buffer
pipe: don't update {a,c,m}time for anonymous pipes
pipe: introduce struct file_operations pipeanon_fops
-rw-r--r-- | fs/pipe.c | 181 | ||||
-rw-r--r-- | fs/splice.c | 40 | ||||
-rw-r--r-- | include/linux/pipe_fs_i.h | 2 | ||||
-rw-r--r-- | include/linux/wait.h | 3 | ||||
-rw-r--r-- | kernel/watch_queue.c | 7 |
5 files changed, 124 insertions, 109 deletions
diff --git a/fs/pipe.c b/fs/pipe.c index 4d0799e4e719..da45edd68c41 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -112,20 +112,40 @@ void pipe_double_lock(struct pipe_inode_info *pipe1, pipe_lock(pipe2); } +static struct page *anon_pipe_get_page(struct pipe_inode_info *pipe) +{ + for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { + if (pipe->tmp_page[i]) { + struct page *page = pipe->tmp_page[i]; + pipe->tmp_page[i] = NULL; + return page; + } + } + + return alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); +} + +static void anon_pipe_put_page(struct pipe_inode_info *pipe, + struct page *page) +{ + if (page_count(page) == 1) { + for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { + if (!pipe->tmp_page[i]) { + pipe->tmp_page[i] = page; + return; + } + } + } + + put_page(page); +} + static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; - /* - * If nobody else uses this page, and we don't already have a - * temporary page, let's keep track of it as a one-deep - * allocation cache. (Otherwise just release our reference to it) - */ - if (page_count(page) == 1 && !pipe->tmp_page) - pipe->tmp_page = page; - else - put_page(page); + anon_pipe_put_page(pipe, page); } static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe, @@ -247,7 +267,7 @@ static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, } static ssize_t -pipe_read(struct kiocb *iocb, struct iov_iter *to) +anon_pipe_read(struct kiocb *iocb, struct iov_iter *to) { size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; @@ -274,7 +294,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) /* Read ->head with a barrier vs post_one_notification() */ unsigned int head = smp_load_acquire(&pipe->head); unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; #ifdef CONFIG_WATCH_QUEUE if (pipe->note_loss) { @@ -301,7 +320,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) #endif if (!pipe_empty(head, tail)) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t chars = buf->len; size_t written; int error; @@ -359,29 +378,9 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) break; } mutex_unlock(&pipe->mutex); - /* * We only get here if we didn't actually read anything. * - * However, we could have seen (and removed) a zero-sized - * pipe buffer, and might have made space in the buffers - * that way. - * - * You can't make zero-sized pipe buffers by doing an empty - * write (not even in packet mode), but they can happen if - * the writer gets an EFAULT when trying to fill a buffer - * that already got allocated and inserted in the buffer - * array. - * - * So we still need to wake up any pending writers in the - * _very_ unlikely case that the pipe was full, but we got - * no data. - */ - if (unlikely(wake_writer)) - wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - - /* * But because we didn't read anything, at this point we can * just return directly with -ERESTARTSYS if we're interrupted, * since we've done any required wakeups and there's no need @@ -390,7 +389,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - wake_writer = false; wake_next_reader = true; mutex_lock(&pipe->mutex); } @@ -403,8 +401,15 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + return ret; +} + +static ssize_t +fifo_pipe_read(struct kiocb *iocb, struct iov_iter *to) +{ + int ret = anon_pipe_read(iocb, to); if (ret > 0) - file_accessed(filp); + file_accessed(iocb->ki_filp); return ret; } @@ -424,7 +429,7 @@ static inline bool pipe_writable(const struct pipe_inode_info *pipe) } static ssize_t -pipe_write(struct kiocb *iocb, struct iov_iter *from) +anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; @@ -471,8 +476,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) was_empty = pipe_empty(head, pipe->tail); chars = total_len & (PAGE_SIZE-1); if (chars && !was_empty) { - unsigned int mask = pipe->ring_size - 1; - struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, head - 1); int offset = buf->offset + buf->len; if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) && @@ -503,54 +507,44 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) head = pipe->head; if (!pipe_full(head, pipe->tail, pipe->max_usage)) { - unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf; - struct page *page = pipe->tmp_page; + struct page *page; int copied; - if (!page) { - page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); - if (unlikely(!page)) { - ret = ret ? : -ENOMEM; - break; - } - pipe->tmp_page = page; + page = anon_pipe_get_page(pipe); + if (unlikely(!page)) { + if (!ret) + ret = -ENOMEM; + break; } - /* Allocate a slot in the ring in advance and attach an - * empty buffer. If we fault or otherwise fail to use - * it, either the reader will consume it or it'll still - * be there for the next write. - */ - pipe->head = head + 1; + copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); + if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { + anon_pipe_put_page(pipe, page); + if (!ret) + ret = -EFAULT; + break; + } + pipe->head = head + 1; /* Insert it into the buffer array */ - buf = &pipe->bufs[head & mask]; + buf = pipe_buf(pipe, head); buf->page = page; buf->ops = &anon_pipe_buf_ops; buf->offset = 0; - buf->len = 0; if (is_packetized(filp)) buf->flags = PIPE_BUF_FLAG_PACKET; else buf->flags = PIPE_BUF_FLAG_CAN_MERGE; - pipe->tmp_page = NULL; - copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); - if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { - if (!ret) - ret = -EFAULT; - break; - } - ret += copied; buf->len = copied; + ret += copied; if (!iov_iter_count(from)) break; - } - if (!pipe_full(head, pipe->tail, pipe->max_usage)) continue; + } /* Wait for buffer space to become available. */ if ((filp->f_flags & O_NONBLOCK) || @@ -602,11 +596,21 @@ out: kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); if (wake_next_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { - int err = file_update_time(filp); - if (err) - ret = err; - sb_end_write(file_inode(filp)->i_sb); + return ret; +} + +static ssize_t +fifo_pipe_write(struct kiocb *iocb, struct iov_iter *from) +{ + int ret = anon_pipe_write(iocb, from); + if (ret > 0) { + struct file *filp = iocb->ki_filp; + if (sb_start_write_trylock(file_inode(filp)->i_sb)) { + int err = file_update_time(filp); + if (err) + ret = err; + sb_end_write(file_inode(filp)->i_sb); + } } return ret; } @@ -853,8 +857,10 @@ void free_pipe_info(struct pipe_inode_info *pipe) if (pipe->watch_queue) put_watch_queue(pipe->watch_queue); #endif - if (pipe->tmp_page) - __free_page(pipe->tmp_page); + for (i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { + if (pipe->tmp_page[i]) + __free_page(pipe->tmp_page[i]); + } kfree(pipe->bufs); kfree(pipe); } @@ -874,6 +880,8 @@ static const struct dentry_operations pipefs_dentry_operations = { .d_dname = pipefs_dname, }; +static const struct file_operations pipeanon_fops; + static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); @@ -891,7 +899,7 @@ static struct inode * get_pipe_inode(void) inode->i_pipe = pipe; pipe->files = 2; pipe->readers = pipe->writers = 1; - inode->i_fop = &pipefifo_fops; + inode->i_fop = &pipeanon_fops; /* * Mark the inode dirty from the very beginning, @@ -934,7 +942,7 @@ int create_pipe_files(struct file **res, int flags) f = alloc_file_pseudo(inode, pipe_mnt, "", O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)), - &pipefifo_fops); + &pipeanon_fops); if (IS_ERR(f)) { free_pipe_info(inode->i_pipe); iput(inode); @@ -945,7 +953,7 @@ int create_pipe_files(struct file **res, int flags) f->f_pipe = 0; res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK), - &pipefifo_fops); + &pipeanon_fops); if (IS_ERR(res[0])) { put_pipe_info(inode, inode->i_pipe); fput(f); @@ -1109,8 +1117,8 @@ static void wake_up_partner(struct pipe_inode_info *pipe) static int fifo_open(struct inode *inode, struct file *filp) { + bool is_pipe = inode->i_fop == &pipeanon_fops; struct pipe_inode_info *pipe; - bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; int ret; filp->f_pipe = 0; @@ -1234,8 +1242,19 @@ err: const struct file_operations pipefifo_fops = { .open = fifo_open, - .read_iter = pipe_read, - .write_iter = pipe_write, + .read_iter = fifo_pipe_read, + .write_iter = fifo_pipe_write, + .poll = pipe_poll, + .unlocked_ioctl = pipe_ioctl, + .release = pipe_release, + .fasync = pipe_fasync, + .splice_write = iter_file_splice_write, +}; + +static const struct file_operations pipeanon_fops = { + .open = fifo_open, + .read_iter = anon_pipe_read, + .write_iter = anon_pipe_write, .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, @@ -1271,6 +1290,10 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) struct pipe_buffer *bufs; unsigned int head, tail, mask, n; + /* nr_slots larger than limits of pipe->{head,tail} */ + if (unlikely(nr_slots > (pipe_index_t)-1u)) + return -EINVAL; + bufs = kcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) @@ -1390,7 +1413,9 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) { struct pipe_inode_info *pipe = file->private_data; - if (file->f_op != &pipefifo_fops || !pipe) + if (!pipe) + return NULL; + if (file->f_op != &pipefifo_fops && file->f_op != &pipeanon_fops) return NULL; if (for_splice && pipe_has_watch_queue(pipe)) return NULL; diff --git a/fs/splice.c b/fs/splice.c index 23fa5561b944..90d464241f15 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -200,7 +200,6 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, unsigned int spd_pages = spd->nr_pages; unsigned int tail = pipe->tail; unsigned int head = pipe->head; - unsigned int mask = pipe->ring_size - 1; ssize_t ret = 0; int page_nr = 0; @@ -214,7 +213,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, } while (!pipe_full(head, tail, pipe->max_usage)) { - struct pipe_buffer *buf = &pipe->bufs[head & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, head); buf->page = spd->pages[page_nr]; buf->offset = spd->partial[page_nr].offset; @@ -247,7 +246,6 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { unsigned int head = pipe->head; unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; int ret; if (unlikely(!pipe->readers)) { @@ -256,7 +254,7 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) } else if (pipe_full(head, tail, pipe->max_usage)) { ret = -EAGAIN; } else { - pipe->bufs[head & mask] = *buf; + *pipe_buf(pipe, head) = *buf; pipe->head = head + 1; return buf->len; } @@ -447,11 +445,10 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des { unsigned int head = pipe->head; unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; int ret; while (!pipe_empty(head, tail)) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); sd->len = buf->len; if (sd->len > sd->total_len) @@ -495,8 +492,7 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des static inline bool eat_empty_buffer(struct pipe_inode_info *pipe) { unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); if (unlikely(!buf->len)) { pipe_buf_release(pipe, buf); @@ -690,7 +686,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, while (sd.total_len) { struct kiocb kiocb; struct iov_iter from; - unsigned int head, tail, mask; + unsigned int head, tail; size_t left; int n; @@ -711,12 +707,11 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, head = pipe->head; tail = pipe->tail; - mask = pipe->ring_size - 1; /* build the vector */ left = sd.total_len; for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t this_len = buf->len; /* zero-length bvecs are not supported, skip them */ @@ -752,7 +747,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, /* dismiss the fully eaten buffers, adjust the partial one */ tail = pipe->tail; while (ret) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); if (ret >= buf->len) { ret -= buf->len; buf->len = 0; @@ -809,7 +804,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, pipe_lock(pipe); while (len > 0) { - unsigned int head, tail, mask, bc = 0; + unsigned int head, tail, bc = 0; size_t remain = len; /* @@ -846,10 +841,9 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, head = pipe->head; tail = pipe->tail; - mask = pipe->ring_size - 1; while (!pipe_empty(head, tail)) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t seg; if (!buf->len) { @@ -894,7 +888,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, len -= ret; tail = pipe->tail; while (ret > 0) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t seg = min_t(size_t, ret, buf->len); buf->offset += seg; @@ -1725,7 +1719,6 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; - unsigned int i_mask, o_mask; int ret = 0; bool input_wakeup = false; @@ -1747,9 +1740,7 @@ retry: pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; - i_mask = ipipe->ring_size - 1; o_head = opipe->head; - o_mask = opipe->ring_size - 1; do { size_t o_len; @@ -1792,8 +1783,8 @@ retry: goto retry; } - ibuf = &ipipe->bufs[i_tail & i_mask]; - obuf = &opipe->bufs[o_head & o_mask]; + ibuf = pipe_buf(ipipe, i_tail); + obuf = pipe_buf(opipe, o_head); if (len >= ibuf->len) { /* @@ -1862,7 +1853,6 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe, struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; - unsigned int i_mask, o_mask; ssize_t ret = 0; /* @@ -1873,9 +1863,7 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe, pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; - i_mask = ipipe->ring_size - 1; o_head = opipe->head; - o_mask = opipe->ring_size - 1; do { if (!opipe->readers) { @@ -1896,8 +1884,8 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe, pipe_full(o_head, o_tail, opipe->max_usage)) break; - ibuf = &ipipe->bufs[i_tail & i_mask]; - obuf = &opipe->bufs[o_head & o_mask]; + ibuf = pipe_buf(ipipe, i_tail); + obuf = pipe_buf(opipe, o_head); /* * Get a reference to this pipe buffer, diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index b698758000f8..9d42d473d201 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -108,7 +108,7 @@ struct pipe_inode_info { #ifdef CONFIG_WATCH_QUEUE bool note_loss; #endif - struct page *tmp_page; + struct page *tmp_page[2]; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; struct pipe_buffer *bufs; diff --git a/include/linux/wait.h b/include/linux/wait.h index 6d90ad974408..3503fe822e38 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -316,6 +316,9 @@ extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags); } \ \ cmd; \ + \ + if (condition) \ + break; \ } \ finish_wait(&wq_head, &__wq_entry); \ __out: __ret; \ diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 41e4e8070923..7e45559521af 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -101,12 +101,11 @@ static bool post_one_notification(struct watch_queue *wqueue, struct pipe_inode_info *pipe = wqueue->pipe; struct pipe_buffer *buf; struct page *page; - unsigned int head, tail, mask, note, offset, len; + unsigned int head, tail, note, offset, len; bool done = false; spin_lock_irq(&pipe->rd_wait.lock); - mask = pipe->ring_size - 1; head = pipe->head; tail = pipe->tail; if (pipe_full(head, tail, pipe->ring_size)) @@ -124,7 +123,7 @@ static bool post_one_notification(struct watch_queue *wqueue, memcpy(p + offset, n, len); kunmap_atomic(p); - buf = &pipe->bufs[head & mask]; + buf = pipe_buf(pipe, head); buf->page = page; buf->private = (unsigned long)wqueue; buf->ops = &watch_queue_pipe_buf_ops; @@ -147,7 +146,7 @@ out: return done; lost: - buf = &pipe->bufs[(head - 1) & mask]; + buf = pipe_buf(pipe, head - 1); buf->flags |= PIPE_BUF_FLAG_LOSS; goto out; } |