diff options
Diffstat (limited to 'fs')
240 files changed, 13448 insertions, 5077 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 6ecf863bfa2f..b177fd3b1eb3 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -612,9 +612,9 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma) struct writeback_control wbc = { .nr_to_write = LONG_MAX, .sync_mode = WB_SYNC_ALL, - .range_start = vma->vm_pgoff * PAGE_SIZE, + .range_start = (loff_t)vma->vm_pgoff * PAGE_SIZE, /* absolute end, byte at end included */ - .range_end = vma->vm_pgoff * PAGE_SIZE + + .range_end = (loff_t)vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start - 1), }; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index e34fa20acf61..9a21269b7234 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -260,8 +260,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = rs.bavail; buf->f_files = rs.files; buf->f_ffree = rs.ffree; - buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL; + buf->f_fsid = u64_to_fsid(rs.fsid); buf->f_namelen = rs.namelen; } if (res != -ENOSYS) diff --git a/fs/Makefile b/fs/Makefile index 7bb2a05fda1f..999d1a23f036 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -14,7 +14,7 @@ obj-y := open.o read_write.o file_table.o super.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ - kernel_read_file.o + kernel_read_file.o remap_range.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o diff --git a/fs/adfs/super.c b/fs/adfs/super.c index d553bb5bc17a..bdbd26e571ed 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -210,8 +210,7 @@ static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = sbi->s_namelen; buf->f_bsize = sb->s_blocksize; buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/affs/super.c b/fs/affs/super.c index a100cd9950c8..c6c2a513ec92 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -620,8 +620,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = AFFS_SB(sb)->s_partition_size - AFFS_SB(sb)->s_reserved; buf->f_bfree = free; buf->f_bavail = free; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = AFFSNAMEMAX; return 0; } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 2482032021ca..c1ba13d19024 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -963,8 +963,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; buf->f_files = 0; /* UNKNOWN */ buf->f_ffree = 0; /* UNKNOWN */ - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = BEFS_NAME_LEN; befs_debug(sb, "<--- %s", __func__); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index f8ce1368218b..3ac7611ef7ce 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -229,8 +229,7 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree = buf->f_bavail = info->si_freeb; buf->f_files = info->si_lasti + 1 - BFS_ROOT_INO; buf->f_ffree = info->si_freei; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = BFS_NAMELEN; return 0; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6ea761c84494..35c83f65475b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -182,58 +182,15 @@ static int ceph_releasepage(struct page *page, gfp_t g) return !PagePrivate(page); } -/* - * Read some contiguous pages. If we cross a stripe boundary, shorten - * *plen. Return number of bytes read, or error. - */ -static int ceph_sync_readpages(struct ceph_fs_client *fsc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - u64 off, u64 *plen, - u32 truncate_seq, u64 truncate_size, - struct page **pages, int num_pages, - int page_align) -{ - struct ceph_osd_client *osdc = &fsc->client->osdc; - struct ceph_osd_request *req; - int rc = 0; - - dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, - vino.snap, off, *plen); - req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1, - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - NULL, truncate_seq, truncate_size, - false); - if (IS_ERR(req)) - return PTR_ERR(req); - - /* it may be a short read due to an object boundary */ - osd_req_op_extent_osd_data_pages(req, 0, - pages, *plen, page_align, false, false); - - dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", - off, *plen, *plen, page_align); - - rc = ceph_osdc_start_request(osdc, req, false); - if (!rc) - rc = ceph_osdc_wait_request(osdc, req); - - ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, rc); - - ceph_osdc_put_request(req); - dout("readpages result %d\n", rc); - return rc; -} - -/* - * read a single page, without unlocking it. - */ +/* read a single page, without unlocking it. */ static int ceph_do_readpage(struct file *filp, struct page *page) { struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_client *osdc = &fsc->client->osdc; + struct ceph_osd_request *req; + struct ceph_vino vino = ceph_vino(inode); int err = 0; u64 off = page_offset(page); u64 len = PAGE_SIZE; @@ -260,19 +217,33 @@ static int ceph_do_readpage(struct file *filp, struct page *page) if (err == 0) return -EINPROGRESS; - dout("readpage inode %p file %p page %p index %lu\n", - inode, filp, page, page->index); - err = ceph_sync_readpages(fsc, ceph_vino(inode), - &ci->i_layout, off, &len, - ci->i_truncate_seq, ci->i_truncate_size, - &page, 1, 0); + dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n", + vino.ino, vino.snap, filp, off, len, page, page->index); + req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1, + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, + ci->i_truncate_seq, ci->i_truncate_size, + false); + if (IS_ERR(req)) + return PTR_ERR(req); + + osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); + + err = ceph_osdc_start_request(osdc, req, false); + if (!err) + err = ceph_osdc_wait_request(osdc, req); + + ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, err); + + ceph_osdc_put_request(req); + dout("readpage result %d\n", err); + if (err == -ENOENT) err = 0; if (err < 0) { - SetPageError(page); ceph_fscache_readpage_cancel(inode, page); - if (err == -EBLACKLISTED) - fsc->blacklisted = true; + if (err == -EBLOCKLISTED) + fsc->blocklisted = true; goto out; } if (err < PAGE_SIZE) @@ -312,8 +283,8 @@ static void finish_read(struct ceph_osd_request *req) int i; dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); - if (rc == -EBLACKLISTED) - ceph_inode_to_client(inode)->blacklisted = true; + if (rc == -EBLOCKLISTED) + ceph_inode_to_client(inode)->blocklisted = true; /* unlock all pages, zeroing any data we didn't read */ osd_data = osd_req_op_extent_osd_data(req, 0); @@ -620,50 +591,6 @@ static u64 get_writepages_data_length(struct inode *inode, } /* - * do a synchronous write on N pages - */ -static int ceph_sync_writepages(struct ceph_fs_client *fsc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - struct ceph_snap_context *snapc, - u64 off, u64 len, - u32 truncate_seq, u64 truncate_size, - struct timespec64 *mtime, - struct page **pages, int num_pages) -{ - struct ceph_osd_client *osdc = &fsc->client->osdc; - struct ceph_osd_request *req; - int rc = 0; - int page_align = off & ~PAGE_MASK; - - req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, - CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, - snapc, truncate_seq, truncate_size, - true); - if (IS_ERR(req)) - return PTR_ERR(req); - - /* it may be a short write due to an object boundary */ - osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, - false, false); - dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); - - req->r_mtime = *mtime; - rc = ceph_osdc_start_request(osdc, req, true); - if (!rc) - rc = ceph_osdc_wait_request(osdc, req); - - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, rc); - - ceph_osdc_put_request(req); - if (rc == 0) - rc = len; - dout("writepages result %d\n", rc); - return rc; -} - -/* * Write a single page, but leave the page locked. * * If we get a write error, mark the mapping for error, but still adjust the @@ -671,20 +598,19 @@ static int ceph_sync_writepages(struct ceph_fs_client *fsc, */ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) { - struct inode *inode; - struct ceph_inode_info *ci; - struct ceph_fs_client *fsc; + struct inode *inode = page->mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_snap_context *snapc, *oldest; loff_t page_off = page_offset(page); - int err, len = PAGE_SIZE; + int err; + loff_t len = PAGE_SIZE; struct ceph_writeback_ctl ceph_wbc; + struct ceph_osd_client *osdc = &fsc->client->osdc; + struct ceph_osd_request *req; dout("writepage %p idx %lu\n", page, page->index); - inode = page->mapping->host; - ci = ceph_inode(inode); - fsc = ceph_inode_to_client(inode); - /* verify this is a writeable snap context */ snapc = page_snap_context(page); if (!snapc) { @@ -713,7 +639,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (ceph_wbc.i_size < page_off + len) len = ceph_wbc.i_size - page_off; - dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n", + dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n", inode, page, page->index, page_off, len, snapc, snapc->seq); if (atomic_long_inc_return(&fsc->writeback_count) > @@ -721,11 +647,33 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); set_page_writeback(page); - err = ceph_sync_writepages(fsc, ceph_vino(inode), - &ci->i_layout, snapc, page_off, len, - ceph_wbc.truncate_seq, - ceph_wbc.truncate_size, - &inode->i_mtime, &page, 1); + req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, + ceph_wbc.truncate_seq, ceph_wbc.truncate_size, + true); + if (IS_ERR(req)) { + redirty_page_for_writepage(wbc, page); + end_page_writeback(page); + return PTR_ERR(req); + } + + /* it may be a short write due to an object boundary */ + WARN_ON_ONCE(len > PAGE_SIZE); + osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); + dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len); + + req->r_mtime = inode->i_mtime; + err = ceph_osdc_start_request(osdc, req, true); + if (!err) + err = ceph_osdc_wait_request(osdc, req); + + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, err); + + ceph_osdc_put_request(req); + if (err == 0) + err = len; + if (err < 0) { struct writeback_control tmp_wbc; if (!wbc) @@ -737,8 +685,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) end_page_writeback(page); return err; } - if (err == -EBLACKLISTED) - fsc->blacklisted = true; + if (err == -EBLOCKLISTED) + fsc->blocklisted = true; dout("writepage setting page/mapping error %d %p\n", err, page); mapping_set_error(&inode->i_data, err); @@ -801,8 +749,8 @@ static void writepages_finish(struct ceph_osd_request *req) if (rc < 0) { mapping_set_error(mapping, rc); ceph_set_error_write(ci); - if (rc == -EBLACKLISTED) - fsc->blacklisted = true; + if (rc == -EBLOCKLISTED) + fsc->blocklisted = true; } else { ceph_clear_error_write(ci); } @@ -962,9 +910,8 @@ retry: max_pages = wsize >> PAGE_SHIFT; get_more_pages: - pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index, - end, PAGECACHE_TAG_DIRTY, - max_pages - locked_pages); + pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, + end, PAGECACHE_TAG_DIRTY); dout("pagevec_lookup_range_tag got %d\n", pvec_pages); if (!pvec_pages && !locked_pages) break; @@ -1299,110 +1246,60 @@ static int context_is_writeable_or_written(struct inode *inode, return ret; } -/* - * We are only allowed to write into/dirty the page if the page is - * clean, or already dirty within the same snap context. +/** + * ceph_find_incompatible - find an incompatible context and return it + * @page: page being dirtied + * + * We are only allowed to write into/dirty a page if the page is + * clean, or already dirty within the same snap context. Returns a + * conflicting context if there is one, NULL if there isn't, or a + * negative error code on other errors. * - * called with page locked. - * return success with page locked, - * or any failure (incl -EAGAIN) with page unlocked. + * Must be called with page lock held. */ -static int ceph_update_writeable_page(struct file *file, - loff_t pos, unsigned len, - struct page *page) +static struct ceph_snap_context * +ceph_find_incompatible(struct page *page) { - struct inode *inode = file_inode(file); + struct inode *inode = page->mapping->host; struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - loff_t page_off = pos & PAGE_MASK; - int pos_in_page = pos & ~PAGE_MASK; - int end_in_page = pos_in_page + len; - loff_t i_size; - int r; - struct ceph_snap_context *snapc, *oldest; if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { dout(" page %p forced umount\n", page); - unlock_page(page); - return -EIO; + return ERR_PTR(-EIO); } -retry_locked: - /* writepages currently holds page lock, but if we change that later, */ - wait_on_page_writeback(page); + for (;;) { + struct ceph_snap_context *snapc, *oldest; + + wait_on_page_writeback(page); + + snapc = page_snap_context(page); + if (!snapc || snapc == ci->i_head_snapc) + break; - snapc = page_snap_context(page); - if (snapc && snapc != ci->i_head_snapc) { /* * this page is already dirty in another (older) snap * context! is it writeable now? */ oldest = get_oldest_context(inode, NULL, NULL); if (snapc->seq > oldest->seq) { + /* not writeable -- return it for the caller to deal with */ ceph_put_snap_context(oldest); - dout(" page %p snapc %p not current or oldest\n", - page, snapc); - /* - * queue for writeback, and wait for snapc to - * be writeable or written - */ - snapc = ceph_get_snap_context(snapc); - unlock_page(page); - ceph_queue_writeback(inode); - r = wait_event_killable(ci->i_cap_wq, - context_is_writeable_or_written(inode, snapc)); - ceph_put_snap_context(snapc); - if (r == -ERESTARTSYS) - return r; - return -EAGAIN; + dout(" page %p snapc %p not current or oldest\n", page, snapc); + return ceph_get_snap_context(snapc); } ceph_put_snap_context(oldest); /* yay, writeable, do it now (without dropping page lock) */ - dout(" page %p snapc %p not current, but oldest\n", - page, snapc); - if (!clear_page_dirty_for_io(page)) - goto retry_locked; - r = writepage_nounlock(page, NULL); - if (r < 0) - goto fail_unlock; - goto retry_locked; - } - - if (PageUptodate(page)) { - dout(" page %p already uptodate\n", page); - return 0; - } - - /* full page? */ - if (pos_in_page == 0 && len == PAGE_SIZE) - return 0; - - /* past end of file? */ - i_size = i_size_read(inode); - - if (page_off >= i_size || - (pos_in_page == 0 && (pos+len) >= i_size && - end_in_page - pos_in_page != PAGE_SIZE)) { - dout(" zeroing %p 0 - %d and %d - %d\n", - page, pos_in_page, end_in_page, (int)PAGE_SIZE); - zero_user_segments(page, - 0, pos_in_page, - end_in_page, PAGE_SIZE); - return 0; - } - - /* we need to read it. */ - r = ceph_do_readpage(file, page); - if (r < 0) { - if (r == -EINPROGRESS) - return -EAGAIN; - goto fail_unlock; + dout(" page %p snapc %p not current, but oldest\n", page, snapc); + if (clear_page_dirty_for_io(page)) { + int r = writepage_nounlock(page, NULL); + if (r < 0) + return ERR_PTR(r); + } } - goto retry_locked; -fail_unlock: - unlock_page(page); - return r; + return NULL; } /* @@ -1414,26 +1311,78 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); - struct page *page; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc; + struct page *page = NULL; pgoff_t index = pos >> PAGE_SHIFT; - int r; + int pos_in_page = pos & ~PAGE_MASK; + int r = 0; - do { - /* get a page */ - page = grab_cache_page_write_begin(mapping, index, 0); - if (!page) - return -ENOMEM; + dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); - dout("write_begin file %p inode %p page %p %d~%d\n", file, - inode, page, (int)pos, (int)len); + for (;;) { + page = grab_cache_page_write_begin(mapping, index, 0); + if (!page) { + r = -ENOMEM; + break; + } - r = ceph_update_writeable_page(file, pos, len, page); - if (r < 0) + snapc = ceph_find_incompatible(page); + if (snapc) { + if (IS_ERR(snapc)) { + r = PTR_ERR(snapc); + break; + } + unlock_page(page); put_page(page); - else - *pagep = page; - } while (r == -EAGAIN); + page = NULL; + ceph_queue_writeback(inode); + r = wait_event_killable(ci->i_cap_wq, + context_is_writeable_or_written(inode, snapc)); + ceph_put_snap_context(snapc); + if (r != 0) + break; + continue; + } + + if (PageUptodate(page)) { + dout(" page %p already uptodate\n", page); + break; + } + + /* + * In some cases we don't need to read at all: + * - full page write + * - write that lies completely beyond EOF + * - write that covers the the page from start to EOF or beyond it + */ + if ((pos_in_page == 0 && len == PAGE_SIZE) || + (pos >= i_size_read(inode)) || + (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) { + zero_user_segments(page, 0, pos_in_page, + pos_in_page + len, PAGE_SIZE); + break; + } + /* + * We need to read it. If we get back -EINPROGRESS, then the page was + * handed off to fscache and it will be unlocked when the read completes. + * Refind the page in that case so we can reacquire the page lock. Otherwise + * we got a hard error or the read was completed synchronously. + */ + r = ceph_do_readpage(file, page); + if (r != -EINPROGRESS) + break; + } + + if (r < 0) { + if (page) { + unlock_page(page); + put_page(page); + } + } else { + *pagep = page; + } return r; } @@ -1522,7 +1471,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; struct page *pinned_page = NULL; - loff_t off = vmf->pgoff << PAGE_SHIFT; + loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT; int want, got, err; sigset_t oldset; vm_fault_t ret = VM_FAULT_SIGBUS; @@ -1668,6 +1617,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) inode_inc_iversion_raw(inode); do { + struct ceph_snap_context *snapc; + lock_page(page); if (page_mkwrite_check_truncate(page, inode) < 0) { @@ -1676,13 +1627,26 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) break; } - err = ceph_update_writeable_page(vma->vm_file, off, len, page); - if (err >= 0) { + snapc = ceph_find_incompatible(page); + if (!snapc) { /* success. we'll keep the page locked. */ set_page_dirty(page); ret = VM_FAULT_LOCKED; + break; } - } while (err == -EAGAIN); + + unlock_page(page); + + if (IS_ERR(snapc)) { + ret = VM_FAULT_SIGBUS; + break; + } + + ceph_queue_writeback(inode); + err = wait_event_killable(ci->i_cap_wq, + context_is_writeable_or_written(inode, snapc)); + ceph_put_snap_context(snapc); + } while (err == 0); if (ret == VM_FAULT_LOCKED || ci->i_inline_version != CEPH_INLINE_NONE) { @@ -2039,16 +2003,16 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, if (err >= 0 || err == -ENOENT) have |= POOL_READ; else if (err != -EPERM) { - if (err == -EBLACKLISTED) - fsc->blacklisted = true; + if (err == -EBLOCKLISTED) + fsc->blocklisted = true; goto out_unlock; } if (err2 == 0 || err2 == -EEXIST) have |= POOL_WRITE; else if (err2 != -EPERM) { - if (err2 == -EBLACKLISTED) - fsc->blacklisted = true; + if (err2 == -EBLOCKLISTED) + fsc->blocklisted = true; err = err2; goto out_unlock; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 034b3f4fdd3a..5027bbdca419 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1222,36 +1222,27 @@ struct cap_msg_args { }; /* - * Build and send a cap message to the given MDS. - * - * Caller should be holding s_mutex. + * cap struct size + flock buffer size + inline version + inline data size + + * osd_epoch_barrier + oldest_flush_tid */ -static int send_cap_msg(struct cap_msg_args *arg) +#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \ + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4) + +/* Marshal up the cap msg to the MDS */ +static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) { struct ceph_mds_caps *fc; - struct ceph_msg *msg; void *p; - size_t extra_len; struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc; - dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" - " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu" - " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op), - arg->cid, arg->ino, ceph_cap_string(arg->caps), - ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty), - arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid, - arg->mseq, arg->follows, arg->size, arg->max_size, - arg->xattr_version, + dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n", + __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino, + ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted), + ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq, + arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows, + arg->size, arg->max_size, arg->xattr_version, arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0); - /* flock buffer size + inline version + inline data size + - * osd_epoch_barrier + oldest_flush_tid */ - extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4; - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, - GFP_NOFS, false); - if (!msg) - return -ENOMEM; - msg->hdr.version = cpu_to_le16(10); msg->hdr.tid = cpu_to_le64(arg->flush_tid); @@ -1323,9 +1314,6 @@ static int send_cap_msg(struct cap_msg_args *arg) /* Advisory flags (version 10) */ ceph_encode_32(&p, arg->flags); - - ceph_con_send(&arg->session->s_con, msg); - return 0; } /* @@ -1454,25 +1442,25 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, * * Caller should hold snap_rwsem (read), s_mutex. */ -static void __send_cap(struct ceph_mds_client *mdsc, struct cap_msg_args *arg, - struct ceph_inode_info *ci) +static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) { + struct ceph_msg *msg; struct inode *inode = &ci->vfs_inode; - int ret; - ret = send_cap_msg(arg); - if (ret < 0) { - pr_err("error sending cap msg, ino (%llx.%llx) " - "flushing %s tid %llu, requeue\n", + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); + if (!msg) { + pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n", ceph_vinop(inode), ceph_cap_string(arg->dirty), arg->flush_tid); spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); + __cap_delay_requeue(arg->session->s_mdsc, ci); spin_unlock(&ci->i_ceph_lock); + return; } + encode_cap_msg(msg, arg); + ceph_con_send(&arg->session->s_con, msg); ceph_buffer_put(arg->old_xattr_buf); - if (arg->wake) wake_up_all(&ci->i_cap_wq); } @@ -1483,6 +1471,11 @@ static inline int __send_flush_snap(struct inode *inode, u32 mseq, u64 oldest_flush_tid) { struct cap_msg_args arg; + struct ceph_msg *msg; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); + if (!msg) + return -ENOMEM; arg.session = session; arg.ino = ceph_vino(inode).ino; @@ -1521,7 +1514,9 @@ static inline int __send_flush_snap(struct inode *inode, arg.flags = 0; arg.wake = false; - return send_cap_msg(&arg); + encode_cap_msg(msg, &arg); + ceph_con_send(&arg.session->s_con, msg); + return 0; } /* @@ -1906,9 +1901,8 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci) void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session) { - struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); - struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_cap *cap; u64 flush_tid, oldest_flush_tid; int file_wanted, used, cap_used; @@ -1928,12 +1922,24 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, retry: spin_lock(&ci->i_ceph_lock); retry_locked: + /* Caps wanted by virtue of active open files. */ file_wanted = __ceph_caps_file_wanted(ci); + + /* Caps which have active references against them */ used = __ceph_caps_used(ci); + + /* + * "issued" represents the current caps that the MDS wants us to have. + * "implemented" is the set that we have been granted, and includes the + * ones that have not yet been returned to the MDS (the "revoking" set, + * usually because they have outstanding references). + */ issued = __ceph_caps_issued(ci, &implemented); revoking = implemented & ~issued; want = file_wanted; + + /* The ones we currently want to retain (may be adjusted below) */ retain = file_wanted | used | CEPH_CAP_PIN; if (!mdsc->stopping && inode->i_nlink > 0) { if (file_wanted) { @@ -2011,6 +2017,10 @@ retry_locked: /* NOTE: no side-effects allowed, until we take s_mutex */ + /* + * If we have an auth cap, we don't need to consider any + * overlapping caps as used. + */ cap_used = used; if (ci->i_auth_cap && cap != ci->i_auth_cap) cap_used &= ~ci->i_auth_cap->issued; @@ -2148,7 +2158,7 @@ ack: want, retain, flushing, flush_tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); - __send_cap(mdsc, &arg, ci); + __send_cap(&arg, ci); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } @@ -2222,7 +2232,7 @@ retry_locked: flushing, flush_tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); - __send_cap(mdsc, &arg, ci); + __send_cap(&arg, ci); } else { if (!list_empty(&ci->i_cap_flush_list)) { struct ceph_cap_flush *cf = @@ -2436,7 +2446,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, (cap->issued | cap->implemented), cf->caps, cf->tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); - __send_cap(mdsc, &arg, ci); + __send_cap(&arg, ci); } else { struct ceph_cap_snap *capsnap = container_of(cf, struct ceph_cap_snap, @@ -4284,13 +4294,30 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci, void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) { - int i; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); int bits = (fmode << 1) | 1; + bool is_opened = false; + int i; + + if (count == 1) + atomic64_inc(&mdsc->metric.opened_files); + spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { if (bits & (1 << i)) ci->i_nr_by_mode[i] += count; + + /* + * If any of the mode ref is larger than 1, + * that means it has been already opened by + * others. Just skip checking the PIN ref. + */ + if (i && ci->i_nr_by_mode[i] > 1) + is_opened = true; } + + if (!is_opened) + percpu_counter_inc(&mdsc->metric.opened_inodes); spin_unlock(&ci->i_ceph_lock); } @@ -4301,15 +4328,32 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) */ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) { - int i; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); int bits = (fmode << 1) | 1; + bool is_closed = true; + int i; + + if (count == 1) + atomic64_dec(&mdsc->metric.opened_files); + spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { if (bits & (1 << i)) { BUG_ON(ci->i_nr_by_mode[i] < count); ci->i_nr_by_mode[i] -= count; } + + /* + * If any of the mode ref is not 0 after + * decreased, that means it is still opened + * by others. Just skip checking the PIN ref. + */ + if (i && ci->i_nr_by_mode[i]) + is_closed = false; } + + if (is_closed) + percpu_counter_dec(&mdsc->metric.opened_inodes); spin_unlock(&ci->i_ceph_lock); } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 3e3fcda9b276..7a8fbe3e4751 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -148,6 +148,17 @@ static int metric_show(struct seq_file *s, void *p) int nr_caps = 0; s64 total, sum, avg, min, max, sq; + sum = percpu_counter_sum(&m->total_inodes); + seq_printf(s, "item total\n"); + seq_printf(s, "------------------------------------------\n"); + seq_printf(s, "%-35s%lld / %lld\n", "opened files / total inodes", + atomic64_read(&m->opened_files), sum); + seq_printf(s, "%-35s%lld / %lld\n", "pinned i_caps / total inodes", + atomic64_read(&m->total_caps), sum); + seq_printf(s, "%-35s%lld / %lld\n", "opened inodes / total inodes", + percpu_counter_sum(&m->opened_inodes), sum); + + seq_printf(s, "\n"); seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); seq_printf(s, "-----------------------------------------------------------------------------------\n"); @@ -202,7 +213,8 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p) { struct seq_file *s = p; - seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode), + seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode), + cap->session->s_mds, ceph_cap_string(cap->issued), ceph_cap_string(cap->implemented)); return 0; @@ -222,8 +234,8 @@ static int caps_show(struct seq_file *s, void *p) "reserved\t%d\n" "min\t\t%d\n\n", total, avail, used, reserved, min); - seq_printf(s, "ino issued implemented\n"); - seq_printf(s, "-----------------------------------------------\n"); + seq_printf(s, "ino mds issued implemented\n"); + seq_printf(s, "--------------------------------------------------\n"); mutex_lock(&mdsc->mutex); for (i = 0; i < mdsc->max_sessions; i++) { diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index d72e4a12bb69..a4d48370b2b3 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -38,8 +38,7 @@ static int __dir_lease_try_check(const struct dentry *dentry); static int ceph_d_init(struct dentry *dentry) { struct ceph_dentry_info *di; - struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb); di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL); if (!di) @@ -738,7 +737,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; int op; int mask; @@ -827,8 +826,7 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) static int ceph_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; int err; @@ -889,8 +887,7 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, static int ceph_symlink(struct inode *dir, struct dentry *dentry, const char *dest) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; int err; @@ -942,8 +939,7 @@ out: static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; int err = -EROFS; @@ -1010,8 +1006,7 @@ out: static int ceph_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; int err; @@ -1192,8 +1187,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb); struct ceph_mds_request *req; int op = CEPH_MDS_OP_RENAME; int err; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3f4c993dfc6f..209535d5b8d3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -182,8 +182,7 @@ static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty) static struct ceph_mds_request * prepare_open_request(struct super_block *sb, int flags, int create_mode) { - struct ceph_fs_client *fsc = ceph_sb_to_client(sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb); struct ceph_mds_request *req; int want_auth = USE_ANY_MDS; int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; @@ -256,8 +255,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) case S_IFDIR: ret = ceph_init_file_info(inode, file, fmode, S_ISDIR(inode->i_mode)); - if (ret) - return ret; break; case S_IFLNK: @@ -285,7 +282,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) */ int ceph_renew_caps(struct inode *inode, int fmode) { - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req; int err, flags, wanted; @@ -865,6 +862,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, size_t page_off; u64 i_size; bool more; + int idx; + size_t left; req = ceph_osdc_new_request(osdc, &ci->i_layout, ci->i_vino, off, &len, 0, 1, @@ -878,29 +877,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, more = len < iov_iter_count(to); - if (unlikely(iov_iter_is_pipe(to))) { - ret = iov_iter_get_pages_alloc(to, &pages, len, - &page_off); - if (ret <= 0) { - ceph_osdc_put_request(req); - ret = -ENOMEM; - break; - } - num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE); - if (ret < len) { - len = ret; - osd_req_op_extent_update(req, 0, len); - more = false; - } - } else { - num_pages = calc_pages_for(off, len); - page_off = off & ~PAGE_MASK; - pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); - if (IS_ERR(pages)) { - ceph_osdc_put_request(req); - ret = PTR_ERR(pages); - break; - } + num_pages = calc_pages_for(off, len); + page_off = off & ~PAGE_MASK; + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) { + ceph_osdc_put_request(req); + ret = PTR_ERR(pages); + break; } osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, @@ -931,36 +914,27 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ret += zlen; } - if (unlikely(iov_iter_is_pipe(to))) { - if (ret > 0) { - iov_iter_advance(to, ret); - off += ret; - } else { - iov_iter_advance(to, 0); - } - ceph_put_page_vector(pages, num_pages, false); - } else { - int idx = 0; - size_t left = ret > 0 ? ret : 0; - while (left > 0) { - size_t len, copied; - page_off = off & ~PAGE_MASK; - len = min_t(size_t, left, PAGE_SIZE - page_off); - copied = copy_page_to_iter(pages[idx++], - page_off, len, to); - off += copied; - left -= copied; - if (copied < len) { - ret = -EFAULT; - break; - } + idx = 0; + left = ret > 0 ? ret : 0; + while (left > 0) { + size_t len, copied; + page_off = off & ~PAGE_MASK; + len = min_t(size_t, left, PAGE_SIZE - page_off); + SetPageUptodate(pages[idx]); + copied = copy_page_to_iter(pages[idx++], + page_off, len, to); + off += copied; + left -= copied; + if (copied < len) { + ret = -EFAULT; + break; } - ceph_release_page_vector(pages, num_pages); } + ceph_release_page_vector(pages, num_pages); if (ret < 0) { - if (ret == -EBLACKLISTED) - fsc->blacklisted = true; + if (ret == -EBLOCKLISTED) + fsc->blocklisted = true; break; } @@ -1052,8 +1026,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) struct inode *inode = req->r_inode; struct ceph_aio_request *aio_req = req->r_priv; struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_client_metric *metric = &fsc->mdsc->metric; + struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric; BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); BUG_ON(!osd_data->num_bvecs); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d163fa96cb40..526faf4778ce 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -42,10 +42,13 @@ static void ceph_inode_work(struct work_struct *work); static int ceph_set_ino_cb(struct inode *inode, void *data) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); ci->i_vino = *(struct ceph_vino *)data; inode->i_ino = ceph_vino_to_ino_t(ci->i_vino); inode_set_iversion_raw(inode, 0); + percpu_counter_inc(&mdsc->metric.total_inodes); + return 0; } @@ -538,11 +541,14 @@ void ceph_free_inode(struct inode *inode) void ceph_evict_inode(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_inode_frag *frag; struct rb_node *n; dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); + percpu_counter_dec(&mdsc->metric.total_inodes); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); @@ -558,8 +564,6 @@ void ceph_evict_inode(struct inode *inode) * caps in i_snap_caps. */ if (ci->i_snap_realm) { - struct ceph_mds_client *mdsc = - ceph_inode_to_client(inode)->mdsc; if (ceph_snap(inode) == CEPH_NOSNAP) { struct ceph_snap_realm *realm = ci->i_snap_realm; dout(" dropping residual ref to snap realm %p\n", @@ -739,7 +743,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, struct ceph_mds_session *session, int cap_fmode, struct ceph_cap_reservation *caps_reservation) { - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_mds_reply_inode *info = iinfo->in; struct ceph_inode_info *ci = ceph_inode(inode); int issued, new_issued, info_caps; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index d6b9166e71e4..048a435a29be 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -63,7 +63,7 @@ static const struct file_lock_operations ceph_fl_lock_ops = { static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, int cmd, u8 wait, struct file_lock *fl) { - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_mds_request *req; int err; u64 length = 0; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4a26862d7667..08f1c0c31dc2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3303,7 +3303,7 @@ bad: } static int __decode_session_metadata(void **p, void *end, - bool *blacklisted) + bool *blocklisted) { /* map<string,string> */ u32 n; @@ -3317,8 +3317,12 @@ static int __decode_session_metadata(void **p, void *end, *p += len; ceph_decode_32_safe(p, end, len, bad); ceph_decode_need(p, end, len, bad); + /* + * Match "blocklisted (blacklisted)" from newer MDSes, + * or "blacklisted" from older MDSes. + */ if (err_str && strnstr(*p, "blacklisted", len)) - *blacklisted = true; + *blocklisted = true; *p += len; } return 0; @@ -3341,7 +3345,7 @@ static void handle_session(struct ceph_mds_session *session, u32 op; u64 seq, features = 0; int wake = 0; - bool blacklisted = false; + bool blocklisted = false; /* decode */ ceph_decode_need(&p, end, sizeof(*h), bad); @@ -3354,7 +3358,7 @@ static void handle_session(struct ceph_mds_session *session, if (msg_version >= 3) { u32 len; /* version >= 2, metadata */ - if (__decode_session_metadata(&p, end, &blacklisted) < 0) + if (__decode_session_metadata(&p, end, &blocklisted) < 0) goto bad; /* version >= 3, feature bits */ ceph_decode_32_safe(&p, end, len, bad); @@ -3445,8 +3449,8 @@ static void handle_session(struct ceph_mds_session *session, session->s_state = CEPH_MDS_SESSION_REJECTED; cleanup_session_requests(mdsc, session); remove_session_caps(session); - if (blacklisted) - mdsc->fsc->blacklisted = true; + if (blocklisted) + mdsc->fsc->blocklisted = true; wake = 2; /* for good measure */ break; @@ -3612,6 +3616,39 @@ fail_msg: return err; } +static struct dentry* d_find_primary(struct inode *inode) +{ + struct dentry *alias, *dn = NULL; + + if (hlist_empty(&inode->i_dentry)) + return NULL; + + spin_lock(&inode->i_lock); + if (hlist_empty(&inode->i_dentry)) + goto out_unlock; + + if (S_ISDIR(inode->i_mode)) { + alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); + if (!IS_ROOT(alias)) + dn = dget(alias); + goto out_unlock; + } + + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + spin_lock(&alias->d_lock); + if (!d_unhashed(alias) && + (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) { + dn = dget_dlock(alias); + } + spin_unlock(&alias->d_lock); + if (dn) + break; + } +out_unlock: + spin_unlock(&inode->i_lock); + return dn; +} + /* * Encode information about a cap for a reconnect with the MDS. */ @@ -3625,13 +3662,32 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_inode_info *ci = cap->ci; struct ceph_reconnect_state *recon_state = arg; struct ceph_pagelist *pagelist = recon_state->pagelist; - int err; + struct dentry *dentry; + char *path; + int pathlen, err; + u64 pathbase; u64 snap_follows; dout(" adding %p ino %llx.%llx cap %p %lld %s\n", inode, ceph_vinop(inode), cap, cap->cap_id, ceph_cap_string(cap->issued)); + dentry = d_find_primary(inode); + if (dentry) { + /* set pathbase to parent dir when msg_version >= 2 */ + path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, + recon_state->msg_version >= 2); + dput(dentry); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out_err; + } + } else { + path = NULL; + pathlen = 0; + pathbase = 0; + } + spin_lock(&ci->i_ceph_lock); cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ @@ -3652,7 +3708,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v2.issued = cpu_to_le32(cap->issued); rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); - rec.v2.pathbase = 0; + rec.v2.pathbase = cpu_to_le64(pathbase); rec.v2.flock_len = (__force __le32) ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1); } else { @@ -3663,7 +3719,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime); ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); - rec.v1.pathbase = 0; + rec.v1.pathbase = cpu_to_le64(pathbase); } if (list_empty(&ci->i_cap_snaps)) { @@ -3725,7 +3781,7 @@ encode_again: sizeof(struct ceph_filelock); rec.v2.flock_len = cpu_to_le32(struct_len); - struct_len += sizeof(u32) + sizeof(rec.v2); + struct_len += sizeof(u32) + pathlen + sizeof(rec.v2); if (struct_v >= 2) struct_len += sizeof(u64); /* snap_follows */ @@ -3749,7 +3805,7 @@ encode_again: ceph_pagelist_encode_8(pagelist, 1); ceph_pagelist_encode_32(pagelist, struct_len); } - ceph_pagelist_encode_string(pagelist, NULL, 0); + ceph_pagelist_encode_string(pagelist, path, pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2)); ceph_locks_to_pagelist(flocks, pagelist, num_fcntl_locks, num_flock_locks); @@ -3758,39 +3814,20 @@ encode_again: out_freeflocks: kfree(flocks); } else { - u64 pathbase = 0; - int pathlen = 0; - char *path = NULL; - struct dentry *dentry; - - dentry = d_find_alias(inode); - if (dentry) { - path = ceph_mdsc_build_path(dentry, - &pathlen, &pathbase, 0); - dput(dentry); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out_err; - } - rec.v1.pathbase = cpu_to_le64(pathbase); - } - err = ceph_pagelist_reserve(pagelist, sizeof(u64) + sizeof(u32) + pathlen + sizeof(rec.v1)); - if (err) { - goto out_freepath; - } + if (err) + goto out_err; ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); ceph_pagelist_encode_string(pagelist, path, pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); -out_freepath: - ceph_mdsc_free_path(path, pathlen); } out_err: - if (err >= 0) + ceph_mdsc_free_path(path, pathlen); + if (!err) recon_state->nr_caps++; return err; } @@ -4334,14 +4371,14 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc) if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED) return; - if (!READ_ONCE(fsc->blacklisted)) + if (!READ_ONCE(fsc->blocklisted)) return; if (fsc->last_auto_reconnect && time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30)) return; - pr_info("auto reconnect after blacklisted\n"); + pr_info("auto reconnect after blocklisted\n"); fsc->last_auto_reconnect = jiffies; ceph_force_reconnect(fsc->sb); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 658800605bfb..cbf8af437140 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -393,7 +393,7 @@ struct ceph_mds_client { struct ceph_mds_session **sessions; /* NULL for mds if no session */ atomic_t num_sessions; - int max_sessions; /* len of s_mds_sessions */ + int max_sessions; /* len of sessions array */ int stopping; /* true if shutting down */ atomic64_t quotarealms_count; /* # realms with quota */ diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 2466b261fba2..fee4c4778313 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -192,11 +192,23 @@ int ceph_metric_init(struct ceph_client_metric *m) m->total_metadatas = 0; m->metadata_latency_sum = 0; + atomic64_set(&m->opened_files, 0); + ret = percpu_counter_init(&m->opened_inodes, 0, GFP_KERNEL); + if (ret) + goto err_opened_inodes; + ret = percpu_counter_init(&m->total_inodes, 0, GFP_KERNEL); + if (ret) + goto err_total_inodes; + m->session = NULL; INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work); return 0; +err_total_inodes: + percpu_counter_destroy(&m->opened_inodes); +err_opened_inodes: + percpu_counter_destroy(&m->i_caps_mis); err_i_caps_mis: percpu_counter_destroy(&m->i_caps_hit); err_i_caps_hit: @@ -212,6 +224,8 @@ void ceph_metric_destroy(struct ceph_client_metric *m) if (!m) return; + percpu_counter_destroy(&m->total_inodes); + percpu_counter_destroy(&m->opened_inodes); percpu_counter_destroy(&m->i_caps_mis); percpu_counter_destroy(&m->i_caps_hit); percpu_counter_destroy(&m->d_lease_mis); diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index 1d0959d669d7..710f3f1dceab 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -115,6 +115,13 @@ struct ceph_client_metric { ktime_t metadata_latency_min; ktime_t metadata_latency_max; + /* The total number of directories and files that are opened */ + atomic64_t opened_files; + + /* The total number of inodes that have opened files or directories */ + struct percpu_counter opened_inodes; + struct percpu_counter total_inodes; + struct ceph_mds_session *session; struct delayed_work delayed_work; /* delayed work */ }; diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index cc2c4d40b022..83cb4f26b689 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -12,7 +12,7 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc) { - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); if (inc) atomic64_inc(&mdsc->quotarealms_count); else @@ -21,8 +21,8 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc) static inline bool ceph_has_realms_with_quotas(struct inode *inode) { - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; - struct super_block *sb = mdsc->fsc->sb; + struct super_block *sb = inode->i_sb; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb); struct inode *root = d_inode(sb->s_root); if (atomic64_read(&mdsc->quotarealms_count) > 0) @@ -266,7 +266,7 @@ restart: static bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) { - struct ceph_mds_client *mdsc = ceph_inode_to_client(old)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb); struct ceph_snap_realm *old_realm, *new_realm; bool is_same; @@ -313,7 +313,7 @@ enum quota_check_op { static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op, loff_t delta) { - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_inode_info *ci; struct ceph_snap_realm *realm, *next; struct inode *in; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 923be9399b21..0da39c16dab4 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -602,7 +602,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { struct inode *inode = &ci->vfs_inode; - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); BUG_ON(capsnap->writing); capsnap->size = inode->i_size; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 7ec0e6d03d10..33ba6f0aa55c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -104,8 +104,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) le64_to_cpu(*((__le64 *)&monc->monmap->fsid + 1)); mutex_unlock(&monc->mutex); - buf->f_fsid.val[0] = fsid & 0xffffffff; - buf->f_fsid.val[1] = fsid >> 32; + buf->f_fsid = u64_to_fsid(fsid); return 0; } @@ -1205,14 +1204,13 @@ nomem: static void ceph_kill_sb(struct super_block *s) { struct ceph_fs_client *fsc = ceph_sb_to_client(s); - dev_t dev = s->s_dev; dout("kill_sb %p\n", s); ceph_mdsc_pre_umount(fsc->mdsc); flush_fs_workqueues(fsc); - generic_shutdown_super(s); + kill_anon_super(s); fsc->client->extra_mon_dispatch = NULL; ceph_fs_debugfs_cleanup(fsc); @@ -1220,7 +1218,6 @@ static void ceph_kill_sb(struct super_block *s) ceph_fscache_unregister_fs(fsc); destroy_fs_client(fsc); - free_anon_bdev(dev); } static struct file_system_type ceph_fs_type = { @@ -1243,13 +1240,13 @@ int ceph_force_reconnect(struct super_block *sb) * see remove_session_caps_cb() */ flush_workqueue(fsc->inode_wq); - /* In case that we were blacklisted. This also reset + /* In case that we were blocklisted. This also reset * all mon/osd connections */ ceph_reset_client_addr(fsc->client); ceph_osdc_clear_abort_err(&fsc->client->osdc); - fsc->blacklisted = false; + fsc->blocklisted = false; fsc->mount_state = CEPH_MOUNT_MOUNTED; if (sb->s_root) { diff --git a/fs/ceph/super.h b/fs/ceph/super.h index a3995ebe0623..482473e4cce1 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -32,7 +32,7 @@ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) -#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blacklisted */ +#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blocklisted */ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ @@ -109,7 +109,7 @@ struct ceph_fs_client { unsigned long mount_state; unsigned long last_auto_reconnect; - bool blacklisted; + bool blocklisted; bool have_copy_from2; @@ -160,7 +160,8 @@ struct ceph_cap { int issued; /* latest, from the mds */ int implemented; /* implemented superset of issued (for revocation) */ - int mds, mds_wanted; + int mds; /* mds index for this cap */ + int mds_wanted; /* caps wanted from this mds */ }; /* caps to release */ struct { @@ -451,6 +452,12 @@ ceph_sb_to_client(const struct super_block *sb) return (struct ceph_fs_client *)sb->s_fs_info; } +static inline struct ceph_mds_client * +ceph_sb_to_mdsc(const struct super_block *sb) +{ + return (struct ceph_mds_client *)ceph_sb_to_client(sb)->mdsc; +} + static inline struct ceph_vino ceph_vino(const struct inode *inode) { diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 3a733ac33d9b..197cb1234341 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -116,7 +116,8 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, * NULL terminates however, so call it on a temporary buffer and then memcpy * the result into place. */ -static int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...) +static __printf(3, 4) +int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...) { int ret; va_list args; diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 51bae9340842..cd17d0e50f2a 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -10,7 +10,7 @@ cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \ cifs_unicode.o nterr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o \ smb2ops.o smb2maperror.o smb2transport.o \ - smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o + smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o cifs-$(CONFIG_CIFS_XATTR) += xattr.o diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index 689162e2e175..3150c19cdc2f 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -530,8 +530,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, return 0; } else if ((cls != ASN1_CTX) || (con != ASN1_CON) || (tag != ASN1_EOC)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n", + cls, con, tag, end); return 0; } @@ -541,8 +541,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, return 0; } else if ((cls != ASN1_UNI) || (con != ASN1_CON) || (tag != ASN1_SEQ)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 1\n", + cls, con, tag, end); return 0; } @@ -552,8 +552,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, return 0; } else if ((cls != ASN1_CTX) || (con != ASN1_CON) || (tag != ASN1_EOC)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n", + cls, con, tag, end); return 0; } @@ -564,8 +564,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, return 0; } else if ((cls != ASN1_UNI) || (con != ASN1_CON) || (tag != ASN1_SEQ)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d sequence_end = %p exit 1\n", + cls, con, tag, sequence_end); return 0; } diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 498777d859eb..9bd03a231032 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -488,7 +488,13 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, else if (map_chars == SFM_MAP_UNI_RSVD) { bool end_of_string; - if (i == srclen - 1) + /** + * Remap spaces and periods found at the end of every + * component of the path. The special cases of '.' and + * '..' do not need to be dealt with explicitly because + * they are addressed in namei.c:link_path_walk(). + **/ + if ((i == srclen - 1) || (source[i+1] == '\\')) end_of_string = true; else end_of_string = false; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index fcff14ef1c70..23b21e943652 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -338,7 +338,7 @@ invalidate_key: goto out_key_put; } -static int +int sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, struct cifs_fattr *fattr, uint sidtype) { @@ -359,7 +359,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, return -EIO; } - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) { + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) || + (cifs_sb_master_tcon(cifs_sb)->posix_extensions)) { uint32_t unix_id; bool is_group; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 0fb99d25e8a8..472cb7777e3e 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -71,6 +71,8 @@ bool enable_oplocks = true; bool linuxExtEnabled = true; bool lookupCacheEnabled = true; bool disable_legacy_dialects; /* false by default */ +bool enable_gcm_256; /* false by default, change when more servers support it */ +bool require_gcm_256; /* false by default */ unsigned int global_secflags = CIFSSEC_DEF; /* unsigned int ntlmv2_support = 0; */ unsigned int sign_CIFS_PDUs = 1; @@ -104,6 +106,12 @@ MODULE_PARM_DESC(slow_rsp_threshold, "Amount of time (in seconds) to wait " module_param(enable_oplocks, bool, 0644); MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1"); +module_param(enable_gcm_256, bool, 0644); +MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: n/N/0"); + +module_param(require_gcm_256, bool, 0644); +MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. Default: n/N/0"); + module_param(disable_legacy_dialects, bool, 0644); MODULE_PARM_DESC(disable_legacy_dialects, "To improve security it may be " "helpful to restrict the ability to " diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 99b3180c613a..905d03863721 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -156,5 +156,5 @@ extern int cifs_truncate_page(struct address_space *mapping, loff_t from); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.28" +#define CIFS_VERSION "2.29" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index b565d83ba89e..484ec2d8c5c9 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -195,18 +195,6 @@ struct smb_rqst { unsigned int rq_tailsz; /* length of last page */ }; -enum smb_version { - Smb_1 = 1, - Smb_20, - Smb_21, - Smb_30, - Smb_302, - Smb_311, - Smb_3any, - Smb_default, - Smb_version_err -}; - struct mid_q_entry; struct TCP_Server_Info; struct cifsFileInfo; @@ -310,6 +298,10 @@ struct smb_version_operations { /* query file data from the server */ int (*query_file_info)(const unsigned int, struct cifs_tcon *, struct cifs_fid *, FILE_ALL_INFO *); + /* query reparse tag from srv to determine which type of special file */ + int (*query_reparse_tag)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *path, + __u32 *reparse_tag); /* get server index number */ int (*get_srv_inum)(const unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const char *, @@ -510,6 +502,8 @@ struct smb_version_operations { struct fiemap_extent_info *, u64, u64); /* version specific llseek implementation */ loff_t (*llseek)(struct file *, struct cifs_tcon *, loff_t, int); + /* Check for STATUS_IO_TIMEOUT */ + bool (*is_status_io_timeout)(char *buf); }; struct smb_version_values { @@ -1954,6 +1948,8 @@ extern bool lookupCacheEnabled; extern unsigned int global_secflags; /* if on, session setup sent with more secure ntlmssp2 challenge/resp */ extern unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ +extern bool enable_gcm_256; /* allow optional negotiate of strongest signing (aes-gcm-256) */ +extern bool require_gcm_256; /* require use of strongest signing (aes-gcm-256) */ extern bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ extern unsigned int CIFSMaxBufSize; /* max size not including hdr */ extern unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index bb68cbf81074..24c6f36177ba 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -209,6 +209,8 @@ extern int cifs_set_file_info(struct inode *inode, struct iattr *attrs, extern int cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, const unsigned int xid); +extern int sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, + struct cifs_fattr *fattr, uint sidtype); extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, bool get_mode_from_special_sid, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a5731dd6e656..c38156f324dd 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -61,6 +61,7 @@ #ifdef CONFIG_CIFS_DFS_UPCALL #include "dfs_cache.h" #endif +#include "fs_context.h" extern mempool_t *cifs_req_poolp; extern bool disable_legacy_dialects; @@ -69,6 +70,9 @@ extern bool disable_legacy_dialects; #define TLINK_ERROR_EXPIRE (1 * HZ) #define TLINK_IDLE_EXPIRE (600 * HZ) +/* Drop the connection to not overload the server */ +#define NUM_STATUS_IO_TIMEOUT 5 + enum { /* Mount options that take no arguments */ Opt_user_xattr, Opt_nouser_xattr, @@ -276,66 +280,6 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_err, NULL } }; -enum { - Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p, - Opt_sec_ntlmsspi, Opt_sec_ntlmssp, - Opt_ntlm, Opt_sec_ntlmi, Opt_sec_ntlmv2, - Opt_sec_ntlmv2i, Opt_sec_lanman, - Opt_sec_none, - - Opt_sec_err -}; - -static const match_table_t cifs_secflavor_tokens = { - { Opt_sec_krb5, "krb5" }, - { Opt_sec_krb5i, "krb5i" }, - { Opt_sec_krb5p, "krb5p" }, - { Opt_sec_ntlmsspi, "ntlmsspi" }, - { Opt_sec_ntlmssp, "ntlmssp" }, - { Opt_ntlm, "ntlm" }, - { Opt_sec_ntlmi, "ntlmi" }, - { Opt_sec_ntlmv2, "nontlm" }, - { Opt_sec_ntlmv2, "ntlmv2" }, - { Opt_sec_ntlmv2i, "ntlmv2i" }, - { Opt_sec_lanman, "lanman" }, - { Opt_sec_none, "none" }, - - { Opt_sec_err, NULL } -}; - -/* cache flavors */ -enum { - Opt_cache_loose, - Opt_cache_strict, - Opt_cache_none, - Opt_cache_ro, - Opt_cache_rw, - Opt_cache_err -}; - -static const match_table_t cifs_cacheflavor_tokens = { - { Opt_cache_loose, "loose" }, - { Opt_cache_strict, "strict" }, - { Opt_cache_none, "none" }, - { Opt_cache_ro, "ro" }, - { Opt_cache_rw, "singleclient" }, - { Opt_cache_err, NULL } -}; - -static const match_table_t cifs_smb_version_tokens = { - { Smb_1, SMB1_VERSION_STRING }, - { Smb_20, SMB20_VERSION_STRING}, - { Smb_21, SMB21_VERSION_STRING }, - { Smb_30, SMB30_VERSION_STRING }, - { Smb_302, SMB302_VERSION_STRING }, - { Smb_302, ALT_SMB302_VERSION_STRING }, - { Smb_311, SMB311_VERSION_STRING }, - { Smb_311, ALT_SMB311_VERSION_STRING }, - { Smb_3any, SMB3ANY_VERSION_STRING }, - { Smb_default, SMBDEFAULT_VERSION_STRING }, - { Smb_version_err, NULL } -}; - static int ip_connect(struct TCP_Server_Info *server); static int generic_ip_connect(struct TCP_Server_Info *server); static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); @@ -1117,7 +1061,7 @@ cifs_demultiplex_thread(void *p) struct task_struct *task_to_wake = NULL; struct mid_q_entry *mids[MAX_COMPOUND]; char *bufs[MAX_COMPOUND]; - unsigned int noreclaim_flag; + unsigned int noreclaim_flag, num_io_timeout = 0; noreclaim_flag = memalloc_noreclaim_save(); cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current)); @@ -1213,6 +1157,16 @@ next_pdu: continue; } + if (server->ops->is_status_io_timeout && + server->ops->is_status_io_timeout(buf)) { + num_io_timeout++; + if (num_io_timeout > NUM_STATUS_IO_TIMEOUT) { + cifs_reconnect(server); + num_io_timeout = 0; + continue; + } + } + server->lstrp = jiffies; for (i = 0; i < num_mids; i++) { @@ -1359,177 +1313,6 @@ static int get_option_gid(substring_t args[], kgid_t *result) return 0; } -static int cifs_parse_security_flavors(char *value, - struct smb_vol *vol) -{ - - substring_t args[MAX_OPT_ARGS]; - - /* - * With mount options, the last one should win. Reset any existing - * settings back to default. - */ - vol->sectype = Unspecified; - vol->sign = false; - - switch (match_token(value, cifs_secflavor_tokens, args)) { - case Opt_sec_krb5p: - cifs_dbg(VFS, "sec=krb5p is not supported!\n"); - return 1; - case Opt_sec_krb5i: - vol->sign = true; - fallthrough; - case Opt_sec_krb5: - vol->sectype = Kerberos; - break; - case Opt_sec_ntlmsspi: - vol->sign = true; - fallthrough; - case Opt_sec_ntlmssp: - vol->sectype = RawNTLMSSP; - break; - case Opt_sec_ntlmi: - vol->sign = true; - fallthrough; - case Opt_ntlm: - vol->sectype = NTLM; - break; - case Opt_sec_ntlmv2i: - vol->sign = true; - fallthrough; - case Opt_sec_ntlmv2: - vol->sectype = NTLMv2; - break; -#ifdef CONFIG_CIFS_WEAK_PW_HASH - case Opt_sec_lanman: - vol->sectype = LANMAN; - break; -#endif - case Opt_sec_none: - vol->nullauth = 1; - break; - default: - cifs_dbg(VFS, "bad security option: %s\n", value); - return 1; - } - - return 0; -} - -static int -cifs_parse_cache_flavor(char *value, struct smb_vol *vol) -{ - substring_t args[MAX_OPT_ARGS]; - - switch (match_token(value, cifs_cacheflavor_tokens, args)) { - case Opt_cache_loose: - vol->direct_io = false; - vol->strict_io = false; - vol->cache_ro = false; - vol->cache_rw = false; - break; - case Opt_cache_strict: - vol->direct_io = false; - vol->strict_io = true; - vol->cache_ro = false; - vol->cache_rw = false; - break; - case Opt_cache_none: - vol->direct_io = true; - vol->strict_io = false; - vol->cache_ro = false; - vol->cache_rw = false; - break; - case Opt_cache_ro: - vol->direct_io = false; - vol->strict_io = false; - vol->cache_ro = true; - vol->cache_rw = false; - break; - case Opt_cache_rw: - vol->direct_io = false; - vol->strict_io = false; - vol->cache_ro = false; - vol->cache_rw = true; - break; - default: - cifs_dbg(VFS, "bad cache= option: %s\n", value); - return 1; - } - return 0; -} - -static int -cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3) -{ - substring_t args[MAX_OPT_ARGS]; - - switch (match_token(value, cifs_smb_version_tokens, args)) { -#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - case Smb_1: - if (disable_legacy_dialects) { - cifs_dbg(VFS, "mount with legacy dialect disabled\n"); - return 1; - } - if (is_smb3) { - cifs_dbg(VFS, "vers=1.0 (cifs) not permitted when mounting with smb3\n"); - return 1; - } - cifs_dbg(VFS, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n"); - vol->ops = &smb1_operations; - vol->vals = &smb1_values; - break; - case Smb_20: - if (disable_legacy_dialects) { - cifs_dbg(VFS, "mount with legacy dialect disabled\n"); - return 1; - } - if (is_smb3) { - cifs_dbg(VFS, "vers=2.0 not permitted when mounting with smb3\n"); - return 1; - } - vol->ops = &smb20_operations; - vol->vals = &smb20_values; - break; -#else - case Smb_1: - cifs_dbg(VFS, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n"); - return 1; - case Smb_20: - cifs_dbg(VFS, "vers=2.0 mount not permitted when legacy dialects disabled\n"); - return 1; -#endif /* CIFS_ALLOW_INSECURE_LEGACY */ - case Smb_21: - vol->ops = &smb21_operations; - vol->vals = &smb21_values; - break; - case Smb_30: - vol->ops = &smb30_operations; - vol->vals = &smb30_values; - break; - case Smb_302: - vol->ops = &smb30_operations; /* currently identical with 3.0 */ - vol->vals = &smb302_values; - break; - case Smb_311: - vol->ops = &smb311_operations; - vol->vals = &smb311_values; - break; - case Smb_3any: - vol->ops = &smb30_operations; /* currently identical with 3.0 */ - vol->vals = &smb3any_values; - break; - case Smb_default: - vol->ops = &smb30_operations; /* currently identical with 3.0 */ - vol->vals = &smbdefault_values; - break; - default: - cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); - return 1; - } - return 0; -} - /* * Parse a devname into substrings and populate the vol->UNC and vol->prepath * fields with the result. Returns 0 on success and an error otherwise. @@ -3595,7 +3378,10 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) */ tcon->retry = volume_info->retry; tcon->nocase = volume_info->nocase; - tcon->nohandlecache = volume_info->nohandlecache; + if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING) + tcon->nohandlecache = volume_info->nohandlecache; + else + tcon->nohandlecache = 1; tcon->nodelete = volume_info->nodelete; tcon->local_lease = volume_info->local_lease; INIT_LIST_HEAD(&tcon->pending_opens); @@ -3889,13 +3675,21 @@ generic_ip_connect(struct TCP_Server_Info *server) saddr = (struct sockaddr *) &server->dstaddr; if (server->dstaddr.ss_family == AF_INET6) { - sport = ((struct sockaddr_in6 *) saddr)->sin6_port; + struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&server->dstaddr; + + sport = ipv6->sin6_port; slen = sizeof(struct sockaddr_in6); sfamily = AF_INET6; + cifs_dbg(FYI, "%s: connecting to [%pI6]:%d\n", __func__, &ipv6->sin6_addr, + ntohs(sport)); } else { - sport = ((struct sockaddr_in *) saddr)->sin_port; + struct sockaddr_in *ipv4 = (struct sockaddr_in *)&server->dstaddr; + + sport = ipv4->sin_port; slen = sizeof(struct sockaddr_in); sfamily = AF_INET; + cifs_dbg(FYI, "%s: connecting to %pI4:%d\n", __func__, &ipv4->sin_addr, + ntohs(sport)); } if (socket == NULL) { diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c new file mode 100644 index 000000000000..ad6c2fed4055 --- /dev/null +++ b/fs/cifs/fs_context.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020, Microsoft Corporation. + * + * Author(s): Steve French <stfrench@microsoft.com> + * David Howells <dhowells@redhat.com> + */ + +#include "cifsglob.h" +#include "cifs_debug.h" +#include "fs_context.h" + +static const match_table_t cifs_smb_version_tokens = { + { Smb_1, SMB1_VERSION_STRING }, + { Smb_20, SMB20_VERSION_STRING}, + { Smb_21, SMB21_VERSION_STRING }, + { Smb_30, SMB30_VERSION_STRING }, + { Smb_302, SMB302_VERSION_STRING }, + { Smb_302, ALT_SMB302_VERSION_STRING }, + { Smb_311, SMB311_VERSION_STRING }, + { Smb_311, ALT_SMB311_VERSION_STRING }, + { Smb_3any, SMB3ANY_VERSION_STRING }, + { Smb_default, SMBDEFAULT_VERSION_STRING }, + { Smb_version_err, NULL } +}; + +int +cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3) +{ + substring_t args[MAX_OPT_ARGS]; + + switch (match_token(value, cifs_smb_version_tokens, args)) { +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY + case Smb_1: + if (disable_legacy_dialects) { + cifs_dbg(VFS, "mount with legacy dialect disabled\n"); + return 1; + } + if (is_smb3) { + cifs_dbg(VFS, "vers=1.0 (cifs) not permitted when mounting with smb3\n"); + return 1; + } + cifs_dbg(VFS, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n"); + vol->ops = &smb1_operations; + vol->vals = &smb1_values; + break; + case Smb_20: + if (disable_legacy_dialects) { + cifs_dbg(VFS, "mount with legacy dialect disabled\n"); + return 1; + } + if (is_smb3) { + cifs_dbg(VFS, "vers=2.0 not permitted when mounting with smb3\n"); + return 1; + } + vol->ops = &smb20_operations; + vol->vals = &smb20_values; + break; +#else + case Smb_1: + cifs_dbg(VFS, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n"); + return 1; + case Smb_20: + cifs_dbg(VFS, "vers=2.0 mount not permitted when legacy dialects disabled\n"); + return 1; +#endif /* CIFS_ALLOW_INSECURE_LEGACY */ + case Smb_21: + vol->ops = &smb21_operations; + vol->vals = &smb21_values; + break; + case Smb_30: + vol->ops = &smb30_operations; + vol->vals = &smb30_values; + break; + case Smb_302: + vol->ops = &smb30_operations; /* currently identical with 3.0 */ + vol->vals = &smb302_values; + break; + case Smb_311: + vol->ops = &smb311_operations; + vol->vals = &smb311_values; + break; + case Smb_3any: + vol->ops = &smb30_operations; /* currently identical with 3.0 */ + vol->vals = &smb3any_values; + break; + case Smb_default: + vol->ops = &smb30_operations; /* currently identical with 3.0 */ + vol->vals = &smbdefault_values; + break; + default: + cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); + return 1; + } + return 0; +} + +static const match_table_t cifs_secflavor_tokens = { + { Opt_sec_krb5, "krb5" }, + { Opt_sec_krb5i, "krb5i" }, + { Opt_sec_krb5p, "krb5p" }, + { Opt_sec_ntlmsspi, "ntlmsspi" }, + { Opt_sec_ntlmssp, "ntlmssp" }, + { Opt_ntlm, "ntlm" }, + { Opt_sec_ntlmi, "ntlmi" }, + { Opt_sec_ntlmv2, "nontlm" }, + { Opt_sec_ntlmv2, "ntlmv2" }, + { Opt_sec_ntlmv2i, "ntlmv2i" }, + { Opt_sec_lanman, "lanman" }, + { Opt_sec_none, "none" }, + + { Opt_sec_err, NULL } +}; + +int cifs_parse_security_flavors(char *value, struct smb_vol *vol) +{ + + substring_t args[MAX_OPT_ARGS]; + + /* + * With mount options, the last one should win. Reset any existing + * settings back to default. + */ + vol->sectype = Unspecified; + vol->sign = false; + + switch (match_token(value, cifs_secflavor_tokens, args)) { + case Opt_sec_krb5p: + cifs_dbg(VFS, "sec=krb5p is not supported!\n"); + return 1; + case Opt_sec_krb5i: + vol->sign = true; + fallthrough; + case Opt_sec_krb5: + vol->sectype = Kerberos; + break; + case Opt_sec_ntlmsspi: + vol->sign = true; + fallthrough; + case Opt_sec_ntlmssp: + vol->sectype = RawNTLMSSP; + break; + case Opt_sec_ntlmi: + vol->sign = true; + fallthrough; + case Opt_ntlm: + vol->sectype = NTLM; + break; + case Opt_sec_ntlmv2i: + vol->sign = true; + fallthrough; + case Opt_sec_ntlmv2: + vol->sectype = NTLMv2; + break; +#ifdef CONFIG_CIFS_WEAK_PW_HASH + case Opt_sec_lanman: + vol->sectype = LANMAN; + break; +#endif + case Opt_sec_none: + vol->nullauth = 1; + break; + default: + cifs_dbg(VFS, "bad security option: %s\n", value); + return 1; + } + + return 0; +} + +static const match_table_t cifs_cacheflavor_tokens = { + { Opt_cache_loose, "loose" }, + { Opt_cache_strict, "strict" }, + { Opt_cache_none, "none" }, + { Opt_cache_ro, "ro" }, + { Opt_cache_rw, "singleclient" }, + { Opt_cache_err, NULL } +}; + +int +cifs_parse_cache_flavor(char *value, struct smb_vol *vol) +{ + substring_t args[MAX_OPT_ARGS]; + + switch (match_token(value, cifs_cacheflavor_tokens, args)) { + case Opt_cache_loose: + vol->direct_io = false; + vol->strict_io = false; + vol->cache_ro = false; + vol->cache_rw = false; + break; + case Opt_cache_strict: + vol->direct_io = false; + vol->strict_io = true; + vol->cache_ro = false; + vol->cache_rw = false; + break; + case Opt_cache_none: + vol->direct_io = true; + vol->strict_io = false; + vol->cache_ro = false; + vol->cache_rw = false; + break; + case Opt_cache_ro: + vol->direct_io = false; + vol->strict_io = false; + vol->cache_ro = true; + vol->cache_rw = false; + break; + case Opt_cache_rw: + vol->direct_io = false; + vol->strict_io = false; + vol->cache_ro = false; + vol->cache_rw = true; + break; + default: + cifs_dbg(VFS, "bad cache= option: %s\n", value); + return 1; + } + return 0; +} diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h new file mode 100644 index 000000000000..886208a1b0ef --- /dev/null +++ b/fs/cifs/fs_context.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2020, Microsoft Corporation. + * + * Author(s): Steve French <stfrench@microsoft.com> + * David Howells <dhowells@redhat.com> + */ + +#ifndef _FS_CONTEXT_H +#define _FS_CONTEXT_H + +#include <linux/parser.h> +#include "cifsglob.h" + +enum smb_version { + Smb_1 = 1, + Smb_20, + Smb_21, + Smb_30, + Smb_302, + Smb_311, + Smb_3any, + Smb_default, + Smb_version_err +}; + +int cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3); + +enum { + Opt_cache_loose, + Opt_cache_strict, + Opt_cache_none, + Opt_cache_ro, + Opt_cache_rw, + Opt_cache_err +}; + +int cifs_parse_cache_flavor(char *value, struct smb_vol *vol); + +enum cifs_sec_param { + Opt_sec_krb5, + Opt_sec_krb5i, + Opt_sec_krb5p, + Opt_sec_ntlmsspi, + Opt_sec_ntlmssp, + Opt_ntlm, + Opt_sec_ntlmi, + Opt_sec_ntlmv2, + Opt_sec_ntlmv2i, + Opt_sec_lanman, + Opt_sec_none, + + Opt_sec_err +}; + +int cifs_parse_security_flavors(char *value, struct smb_vol *vol); + +#endif diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 1f75b25e559a..9ee5f304592f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -656,7 +656,7 @@ smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo * static void cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, struct super_block *sb, bool adjust_tz, - bool symlink) + bool symlink, u32 reparse_tag) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); @@ -684,8 +684,22 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, fattr->cf_createtime = le64_to_cpu(info->CreationTime); fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); - - if (symlink) { + if (reparse_tag == IO_REPARSE_TAG_LX_SYMLINK) { + fattr->cf_mode |= S_IFLNK | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_LNK; + } else if (reparse_tag == IO_REPARSE_TAG_LX_FIFO) { + fattr->cf_mode |= S_IFIFO | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_FIFO; + } else if (reparse_tag == IO_REPARSE_TAG_AF_UNIX) { + fattr->cf_mode |= S_IFSOCK | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_SOCK; + } else if (reparse_tag == IO_REPARSE_TAG_LX_CHR) { + fattr->cf_mode |= S_IFCHR | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_CHR; + } else if (reparse_tag == IO_REPARSE_TAG_LX_BLK) { + fattr->cf_mode |= S_IFBLK | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_BLK; + } else if (symlink) { /* TODO add more reparse tag checks */ fattr->cf_mode = S_IFLNK; fattr->cf_dtype = DT_LNK; } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { @@ -740,8 +754,9 @@ cifs_get_file_info(struct file *filp) rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data); switch (rc) { case 0: + /* TODO: add support to query reparse tag */ cifs_all_info_to_fattr(&fattr, &find_data, inode->i_sb, false, - false); + false, 0 /* no reparse tag */); break; case -EREMOTE: cifs_create_dfs_fattr(&fattr, inode->i_sb); @@ -910,12 +925,13 @@ cifs_get_inode_info(struct inode **inode, struct cifs_sb_info *cifs_sb = CIFS_SB(sb); bool adjust_tz = false; struct cifs_fattr fattr = {0}; - bool symlink = false; + bool is_reparse_point = false; FILE_ALL_INFO *data = in_data; FILE_ALL_INFO *tmp_data = NULL; void *smb1_backup_rsp_buf = NULL; int rc = 0; int tmprc = 0; + __u32 reparse_tag = 0; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -938,8 +954,8 @@ cifs_get_inode_info(struct inode **inode, goto out; } rc = server->ops->query_path_info(xid, tcon, cifs_sb, - full_path, tmp_data, - &adjust_tz, &symlink); + full_path, tmp_data, + &adjust_tz, &is_reparse_point); data = tmp_data; } @@ -949,7 +965,19 @@ cifs_get_inode_info(struct inode **inode, switch (rc) { case 0: - cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, symlink); + /* + * If the file is a reparse point, it is more complicated + * since we have to check if its reparse tag matches a known + * special file type e.g. symlink or fifo or char etc. + */ + if ((le32_to_cpu(data->Attributes) & ATTR_REPARSE) && + server->ops->query_reparse_tag) { + rc = server->ops->query_reparse_tag(xid, tcon, cifs_sb, + full_path, &reparse_tag); + cifs_dbg(FYI, "reparse tag 0x%x\n", reparse_tag); + } + cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, + is_reparse_point, reparse_tag); break; case -EREMOTE: /* DFS link, no metadata available on this server */ @@ -2883,13 +2911,18 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs) { struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb); + int rc, retries = 0; - if (pTcon->unix_ext) - return cifs_setattr_unix(direntry, attrs); - - return cifs_setattr_nounix(direntry, attrs); + do { + if (pTcon->unix_ext) + rc = cifs_setattr_unix(direntry, attrs); + else + rc = cifs_setattr_nounix(direntry, attrs); + retries++; + } while (is_retryable_error(rc) && retries < 2); /* BB: add cifs_setattr_legacy for really old servers */ + return rc; } #if 0 diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 6df0922e7e30..799be3a5d25e 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -168,10 +168,33 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) fattr->cf_uid = cifs_sb->mnt_uid; fattr->cf_gid = cifs_sb->mnt_gid; + /* + * The IO_REPARSE_TAG_LX_ tags originally were used by WSL but they + * are preferred by the Linux client in some cases since, unlike + * the NFS reparse tag (or EAs), they don't require an extra query + * to determine which type of special file they represent. + * TODO: go through all documented reparse tags to see if we can + * reasonably map some of them to directories vs. files vs. symlinks + */ if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; fattr->cf_dtype = DT_DIR; - } else { + } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_SYMLINK) { + fattr->cf_mode |= S_IFLNK | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_LNK; + } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_FIFO) { + fattr->cf_mode |= S_IFIFO | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_FIFO; + } else if (fattr->cf_cifstag == IO_REPARSE_TAG_AF_UNIX) { + fattr->cf_mode |= S_IFSOCK | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_SOCK; + } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_CHR) { + fattr->cf_mode |= S_IFCHR | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_CHR; + } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_BLK) { + fattr->cf_mode |= S_IFBLK | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_BLK; + } else { /* TODO: should we mark some other reparse points (like DFSR) as directories? */ fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; fattr->cf_dtype = DT_REG; } @@ -267,9 +290,8 @@ cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info, if (reparse_file_needs_reval(fattr)) fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; - /* TODO map SIDs */ - fattr->cf_uid = cifs_sb->mnt_uid; - fattr->cf_gid = cifs_sb->mnt_gid; + sid_to_id(cifs_sb, &parsed.owner, fattr, SIDOWNER); + sid_to_id(cifs_sb, &parsed.group, fattr, SIDGROUP); } static void __dir_info_to_fattr(struct cifs_fattr *fattr, const void *info) @@ -360,11 +382,11 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb, */ static int -initiate_cifs_search(const unsigned int xid, struct file *file) +initiate_cifs_search(const unsigned int xid, struct file *file, + char *full_path) { __u16 search_flags; int rc = 0; - char *full_path = NULL; struct cifsFileInfo *cifsFile; struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); struct tcon_link *tlink = NULL; @@ -400,12 +422,6 @@ initiate_cifs_search(const unsigned int xid, struct file *file) cifsFile->invalidHandle = true; cifsFile->srch_inf.endOfSearch = false; - full_path = build_path_from_dentry(file_dentry(file)); - if (full_path == NULL) { - rc = -ENOMEM; - goto error_exit; - } - cifs_dbg(FYI, "Full path: %s start at: %lld\n", full_path, file->f_pos); ffirst_retry: @@ -444,7 +460,6 @@ ffirst_retry: goto ffirst_retry; } error_exit: - kfree(full_path); cifs_put_tlink(tlink); return rc; } @@ -688,7 +703,8 @@ static int cifs_save_resume_key(const char *current_entry, */ static int find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, - struct file *file, char **current_entry, int *num_to_ret) + struct file *file, char *full_path, + char **current_entry, int *num_to_ret) { __u16 search_flags; int rc = 0; @@ -741,7 +757,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, ntwrk_buf_start); cfile->srch_inf.ntwrk_buf_start = NULL; } - rc = initiate_cifs_search(xid, file); + rc = initiate_cifs_search(xid, file, full_path); if (rc) { cifs_dbg(FYI, "error %d reinitiating a search on rewind\n", rc); @@ -925,15 +941,22 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) char *tmp_buf = NULL; char *end_of_smb; unsigned int max_len; + char *full_path = NULL; xid = get_xid(); + full_path = build_path_from_dentry(file_dentry(file)); + if (full_path == NULL) { + rc = -ENOMEM; + goto rddir2_exit; + } + /* * Ensure FindFirst doesn't fail before doing filldir() for '.' and * '..'. Otherwise we won't be able to notify VFS in case of failure. */ if (file->private_data == NULL) { - rc = initiate_cifs_search(xid, file); + rc = initiate_cifs_search(xid, file, full_path); cifs_dbg(FYI, "initiate cifs search rc %d\n", rc); if (rc) goto rddir2_exit; @@ -960,8 +983,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) } */ tcon = tlink_tcon(cifsFile->tlink); - rc = find_cifs_entry(xid, tcon, ctx->pos, file, ¤t_entry, - &num_to_fill); + rc = find_cifs_entry(xid, tcon, ctx->pos, file, full_path, + ¤t_entry, &num_to_fill); if (rc) { cifs_dbg(FYI, "fce error %d\n", rc); goto rddir2_exit; @@ -1019,6 +1042,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) kfree(tmp_buf); rddir2_exit: + kfree(full_path); free_xid(xid); return rc; } diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index cf20f0b5d836..99a1951a01ec 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -58,6 +58,7 @@ #define SMB2_HMACSHA256_SIZE (32) #define SMB2_CMACAES_SIZE (16) #define SMB3_SIGNKEY_SIZE (16) +#define SMB3_GCM256_CRYPTKEY_SIZE (32) /* Maximum buffer size value we can send with 1 credit */ #define SMB2_MAX_BUFFER_SIZE 65536 diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index eba01d0908dd..1f900b81c34a 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -506,17 +506,17 @@ move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src) int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - FILE_ALL_INFO *data, bool *adjust_tz, bool *symlink) + FILE_ALL_INFO *data, bool *adjust_tz, bool *reparse) { int rc; struct smb2_file_all_info *smb2_data; __u32 create_options = 0; - struct cifs_fid fid; bool no_cached_open = tcon->nohandlecache; struct cifsFileInfo *cfile; + struct cached_fid *cfid = NULL; *adjust_tz = false; - *symlink = false; + *reparse = false; smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, GFP_KERNEL); @@ -525,7 +525,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, /* If it is a root and its handle is cached then use it */ if (!strlen(full_path) && !no_cached_open) { - rc = open_shroot(xid, tcon, cifs_sb, &fid); + rc = open_shroot(xid, tcon, cifs_sb, &cfid); if (rc) goto out; @@ -533,12 +533,13 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, move_smb2_info_to_cifs(data, &tcon->crfid.file_all_info); } else { - rc = SMB2_query_info(xid, tcon, fid.persistent_fid, - fid.volatile_fid, smb2_data); + rc = SMB2_query_info(xid, tcon, + cfid->fid->persistent_fid, + cfid->fid->volatile_fid, smb2_data); if (!rc) move_smb2_info_to_cifs(data, smb2_data); } - close_shroot(&tcon->crfid); + close_shroot(cfid); goto out; } @@ -547,7 +548,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, smb2_data, SMB2_OP_QUERY_INFO, cfile); if (rc == -EOPNOTSUPP) { - *symlink = true; + *reparse = true; create_options |= OPEN_REPARSE_POINT; /* Failed on a symbolic link - query a reparse point info */ @@ -569,7 +570,7 @@ out: int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - struct smb311_posix_qinfo *data, bool *adjust_tz, bool *symlink) + struct smb311_posix_qinfo *data, bool *adjust_tz, bool *reparse) { int rc; __u32 create_options = 0; @@ -577,7 +578,7 @@ smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct smb311_posix_qinfo *smb2_data; *adjust_tz = false; - *symlink = false; + *reparse = false; /* BB TODO: Make struct larger when add support for parsing owner SIDs */ smb2_data = kzalloc(sizeof(struct smb311_posix_qinfo), @@ -598,7 +599,7 @@ smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, ACL_NO_MODE, smb2_data, SMB2_OP_POSIX_QUERY_INFO, cfile); if (rc == -EOPNOTSUPP) { /* BB TODO: When support for special files added to Samba re-verify this path */ - *symlink = true; + *reparse = true; create_options |= OPEN_REPARSE_POINT; /* Failed on a symbolic link - query a reparse point info */ diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 7fde3775cb57..c775682ee973 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -488,7 +488,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_PIPE_CONNECTED, -EIO, "STATUS_PIPE_CONNECTED"}, {STATUS_PIPE_LISTENING, -EIO, "STATUS_PIPE_LISTENING"}, {STATUS_INVALID_READ_MODE, -EIO, "STATUS_INVALID_READ_MODE"}, - {STATUS_IO_TIMEOUT, -ETIMEDOUT, "STATUS_IO_TIMEOUT"}, + {STATUS_IO_TIMEOUT, -EAGAIN, "STATUS_IO_TIMEOUT"}, {STATUS_FILE_FORCED_CLOSED, -EIO, "STATUS_FILE_FORCED_CLOSED"}, {STATUS_PROFILING_NOT_STARTED, -EIO, "STATUS_PROFILING_NOT_STARTED"}, {STATUS_PROFILING_NOT_STOPPED, -EIO, "STATUS_PROFILING_NOT_STOPPED"}, @@ -814,7 +814,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_INVALID_VARIANT, -EIO, "STATUS_INVALID_VARIANT"}, {STATUS_DOMAIN_CONTROLLER_NOT_FOUND, -EIO, "STATUS_DOMAIN_CONTROLLER_NOT_FOUND"}, - {STATUS_ACCOUNT_LOCKED_OUT, -EIO, "STATUS_ACCOUNT_LOCKED_OUT"}, + {STATUS_ACCOUNT_LOCKED_OUT, -EACCES, "STATUS_ACCOUNT_LOCKED_OUT"}, {STATUS_HANDLE_NOT_CLOSABLE, -EIO, "STATUS_HANDLE_NOT_CLOSABLE"}, {STATUS_CONNECTION_REFUSED, -EIO, "STATUS_CONNECTION_REFUSED"}, {STATUS_GRACEFUL_DISCONNECT, -EIO, "STATUS_GRACEFUL_DISCONNECT"}, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d44df8f95bcd..504766cb6c19 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -72,7 +72,7 @@ smb2_add_credits(struct TCP_Server_Info *server, /* eg found case where write overlapping reconnect messed up credits */ if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0)) trace_smb3_reconnect_with_invalid_credits(server->CurrentMid, - server->hostname, *val); + server->hostname, *val, add); if ((instance == 0) || (instance == server->reconnect_instance)) *val += add; else @@ -121,6 +121,8 @@ smb2_add_credits(struct TCP_Server_Info *server, cifs_dbg(FYI, "disabling oplocks\n"); break; default: + trace_smb3_add_credits(server->CurrentMid, + server->hostname, rc, add); cifs_dbg(FYI, "add %u credits total=%d\n", add, rc); } } @@ -651,7 +653,8 @@ smb2_cached_lease_break(struct work_struct *work) * Open the directory at the root of a share */ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, struct cifs_fid *pfid) + struct cifs_sb_info *cifs_sb, + struct cached_fid **cfid) { struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server = ses->server; @@ -666,11 +669,12 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, int rc, flags = 0; __le16 utf16_path = 0; /* Null - since an open of top of share */ u8 oplock = SMB2_OPLOCK_LEVEL_II; + struct cifs_fid *pfid; mutex_lock(&tcon->crfid.fid_mutex); if (tcon->crfid.is_valid) { cifs_dbg(FYI, "found a cached root file handle\n"); - memcpy(pfid, tcon->crfid.fid, sizeof(struct cifs_fid)); + *cfid = &tcon->crfid; kref_get(&tcon->crfid.refcount); mutex_unlock(&tcon->crfid.fid_mutex); return 0; @@ -691,6 +695,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, if (!server->ops->new_lease_key) return -EIO; + pfid = tcon->crfid.fid; server->ops->new_lease_key(pfid); memset(rqst, 0, sizeof(rqst)); @@ -820,6 +825,8 @@ oshr_free: SMB2_query_info_free(&rqst[1]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + if (rc == 0) + *cfid = &tcon->crfid; return rc; } @@ -833,6 +840,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; bool no_cached_open = tcon->nohandlecache; + struct cached_fid *cfid = NULL; oparms.tcon = tcon; oparms.desired_access = FILE_READ_ATTRIBUTES; @@ -841,12 +849,14 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - if (no_cached_open) + if (no_cached_open) { rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL, NULL); - else - rc = open_shroot(xid, tcon, cifs_sb, &fid); - + } else { + rc = open_shroot(xid, tcon, cifs_sb, &cfid); + if (rc == 0) + memcpy(&fid, cfid->fid, sizeof(struct cifs_fid)); + } if (rc) return; @@ -863,7 +873,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, if (no_cached_open) SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); else - close_shroot(&tcon->crfid); + close_shroot(cfid); } static void @@ -2346,6 +2356,17 @@ smb2_is_session_expired(char *buf) return true; } +static bool +smb2_is_status_io_timeout(char *buf) +{ + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + + if (shdr->Status == STATUS_IO_TIMEOUT) + return true; + else + return false; +} + static int smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, struct cifsInodeInfo *cinode) @@ -3013,6 +3034,133 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, return rc; } +int +smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + __u32 *tag) +{ + int rc; + __le16 *utf16_path = NULL; + __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); + int flags = 0; + struct smb_rqst rqst[3]; + int resp_buftype[3]; + struct kvec rsp_iov[3]; + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; + struct kvec close_iov[1]; + struct smb2_ioctl_rsp *ioctl_rsp; + struct reparse_data_buffer *reparse_buf; + u32 plen; + + cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + memset(rqst, 0, sizeof(rqst)); + resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; + memset(rsp_iov, 0, sizeof(rsp_iov)); + + utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + /* + * setup smb2open - TODO add optimization to call cifs_get_readable_path + * to see if there is a handle already open that we can use + */ + memset(&open_iov, 0, sizeof(open_iov)); + rqst[0].rq_iov = open_iov; + rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; + + memset(&oparms, 0, sizeof(oparms)); + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.create_options = cifs_create_options(cifs_sb, OPEN_REPARSE_POINT); + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open_init(tcon, server, + &rqst[0], &oplock, &oparms, utf16_path); + if (rc) + goto query_rp_exit; + smb2_set_next_command(tcon, &rqst[0]); + + + /* IOCTL */ + memset(&io_iov, 0, sizeof(io_iov)); + rqst[1].rq_iov = io_iov; + rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; + + rc = SMB2_ioctl_init(tcon, server, + &rqst[1], fid.persistent_fid, + fid.volatile_fid, FSCTL_GET_REPARSE_POINT, + true /* is_fctl */, NULL, 0, + CIFSMaxBufSize - + MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE); + if (rc) + goto query_rp_exit; + + smb2_set_next_command(tcon, &rqst[1]); + smb2_set_related(&rqst[1]); + + + /* Close */ + memset(&close_iov, 0, sizeof(close_iov)); + rqst[2].rq_iov = close_iov; + rqst[2].rq_nvec = 1; + + rc = SMB2_close_init(tcon, server, + &rqst[2], COMPOUND_FID, COMPOUND_FID, false); + if (rc) + goto query_rp_exit; + + smb2_set_related(&rqst[2]); + + rc = compound_send_recv(xid, tcon->ses, server, + flags, 3, rqst, + resp_buftype, rsp_iov); + + ioctl_rsp = rsp_iov[1].iov_base; + + /* + * Open was successful and we got an ioctl response. + */ + if (rc == 0) { + /* See MS-FSCC 2.3.23 */ + + reparse_buf = (struct reparse_data_buffer *) + ((char *)ioctl_rsp + + le32_to_cpu(ioctl_rsp->OutputOffset)); + plen = le32_to_cpu(ioctl_rsp->OutputCount); + + if (plen + le32_to_cpu(ioctl_rsp->OutputOffset) > + rsp_iov[1].iov_len) { + cifs_tcon_dbg(FYI, "srv returned invalid ioctl len: %d\n", + plen); + rc = -EIO; + goto query_rp_exit; + } + *tag = le32_to_cpu(reparse_buf->ReparseTag); + } + + query_rp_exit: + kfree(utf16_path); + SMB2_open_free(&rqst[0]); + SMB2_ioctl_free(&rqst[1]); + SMB2_close_free(&rqst[2]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + return rc; +} + static struct cifs_ntsd * get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb, const struct cifs_fid *cifsfid, u32 *pacllen) @@ -3072,7 +3220,12 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, oparms.tcon = tcon; oparms.desired_access = READ_CONTROL; oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); + /* + * When querying an ACL, even if the file is a symlink we want to open + * the source not the target, and so the protocol requires that the + * client specify this flag when opening a reparse point + */ + oparms.create_options = cifs_create_options(cifs_sb, 0) | OPEN_REPARSE_POINT; oparms.fid = &fid; oparms.reconnect = false; @@ -3801,10 +3954,11 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM; tr_hdr->OriginalMessageSize = cpu_to_le32(orig_len); tr_hdr->Flags = cpu_to_le16(0x01); - if (cipher_type == SMB2_ENCRYPTION_AES128_GCM) - get_random_bytes(&tr_hdr->Nonce, SMB3_AES128GCM_NONCE); + if ((cipher_type == SMB2_ENCRYPTION_AES128_GCM) || + (cipher_type == SMB2_ENCRYPTION_AES256_GCM)) + get_random_bytes(&tr_hdr->Nonce, SMB3_AES_GCM_NONCE); else - get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CCM_NONCE); + get_random_bytes(&tr_hdr->Nonce, SMB3_AES_CCM_NONCE); memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8); } @@ -3924,7 +4078,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (rc) { cifs_server_dbg(VFS, "%s: Could not get %scryption key\n", __func__, enc ? "en" : "de"); - return 0; + return rc; } rc = smb3_crypto_aead_allocate(server); @@ -3935,7 +4089,12 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, tfm = enc ? server->secmech.ccmaesencrypt : server->secmech.ccmaesdecrypt; - rc = crypto_aead_setkey(tfm, key, SMB3_SIGN_KEY_SIZE); + + if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM) + rc = crypto_aead_setkey(tfm, key, SMB3_GCM256_CRYPTKEY_SIZE); + else + rc = crypto_aead_setkey(tfm, key, SMB3_SIGN_KEY_SIZE); + if (rc) { cifs_server_dbg(VFS, "%s: Failed to set aead key %d\n", __func__, rc); return rc; @@ -3973,11 +4132,12 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, goto free_sg; } - if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) - memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES128GCM_NONCE); + if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || + (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) + memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES_GCM_NONCE); else { iv[0] = 3; - memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES128CCM_NONCE); + memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES_CCM_NONCE); } aead_request_set_crypt(req, sg, sg, crypt_len, iv); @@ -4103,7 +4263,8 @@ smb3_is_transform_hdr(void *buf) static int decrypt_raw_data(struct TCP_Server_Info *server, char *buf, unsigned int buf_data_size, struct page **pages, - unsigned int npages, unsigned int page_data_size) + unsigned int npages, unsigned int page_data_size, + bool is_offloaded) { struct kvec iov[2]; struct smb_rqst rqst = {NULL}; @@ -4129,7 +4290,8 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, memmove(buf, iov[1].iov_base, buf_data_size); - server->total_read = buf_data_size + page_data_size; + if (!is_offloaded) + server->total_read = buf_data_size + page_data_size; return rc; } @@ -4342,7 +4504,7 @@ static void smb2_decrypt_offload(struct work_struct *work) struct mid_q_entry *mid; rc = decrypt_raw_data(dw->server, dw->buf, dw->server->vals->read_rsp_size, - dw->ppages, dw->npages, dw->len); + dw->ppages, dw->npages, dw->len, true); if (rc) { cifs_dbg(VFS, "error decrypting rc=%d\n", rc); goto free_pages; @@ -4448,7 +4610,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, non_offloaded_decrypt: rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size, - pages, npages, len); + pages, npages, len, false); if (rc) goto free_pages; @@ -4504,7 +4666,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, server->total_read += length; buf_size = pdu_length - sizeof(struct smb2_transform_hdr); - length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0); + length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0, false); if (length) return length; @@ -4809,6 +4971,7 @@ struct smb_version_operations smb20_operations = { .make_node = smb2_make_node, .fiemap = smb3_fiemap, .llseek = smb3_llseek, + .is_status_io_timeout = smb2_is_status_io_timeout, }; struct smb_version_operations smb21_operations = { @@ -4909,6 +5072,7 @@ struct smb_version_operations smb21_operations = { .make_node = smb2_make_node, .fiemap = smb3_fiemap, .llseek = smb3_llseek, + .is_status_io_timeout = smb2_is_status_io_timeout, }; struct smb_version_operations smb30_operations = { @@ -4949,6 +5113,8 @@ struct smb_version_operations smb30_operations = { .can_echo = smb2_can_echo, .echo = SMB2_echo, .query_path_info = smb2_query_path_info, + /* WSL tags introduced long after smb2.1, enable for SMB3, 3.11 only */ + .query_reparse_tag = smb2_query_reparse_tag, .get_srv_inum = smb2_get_srv_inum, .query_file_info = smb2_query_file_info, .set_path_size = smb2_set_path_size, @@ -5019,6 +5185,7 @@ struct smb_version_operations smb30_operations = { .make_node = smb2_make_node, .fiemap = smb3_fiemap, .llseek = smb3_llseek, + .is_status_io_timeout = smb2_is_status_io_timeout, }; struct smb_version_operations smb311_operations = { @@ -5059,6 +5226,7 @@ struct smb_version_operations smb311_operations = { .can_echo = smb2_can_echo, .echo = SMB2_echo, .query_path_info = smb2_query_path_info, + .query_reparse_tag = smb2_query_reparse_tag, .get_srv_inum = smb2_get_srv_inum, .query_file_info = smb2_query_file_info, .set_path_size = smb2_set_path_size, @@ -5130,6 +5298,7 @@ struct smb_version_operations smb311_operations = { .make_node = smb2_make_node, .fiemap = smb3_fiemap, .llseek = smb3_llseek, + .is_status_io_timeout = smb2_is_status_io_timeout, }; struct smb_version_values smb20_values = { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 96c172d94fba..445e80862865 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -449,10 +449,22 @@ static void build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt) { pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES; - pneg_ctxt->DataLength = cpu_to_le16(6); /* Cipher Count + two ciphers */ - pneg_ctxt->CipherCount = cpu_to_le16(2); - pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM; - pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM; + if (require_gcm_256) { + pneg_ctxt->DataLength = cpu_to_le16(4); /* Cipher Count + 1 cipher */ + pneg_ctxt->CipherCount = cpu_to_le16(1); + pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES256_GCM; + } else if (enable_gcm_256) { + pneg_ctxt->DataLength = cpu_to_le16(8); /* Cipher Count + 3 ciphers */ + pneg_ctxt->CipherCount = cpu_to_le16(3); + pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM; + pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES256_GCM; + pneg_ctxt->Ciphers[2] = SMB2_ENCRYPTION_AES128_CCM; + } else { + pneg_ctxt->DataLength = cpu_to_le16(6); /* Cipher Count + 2 ciphers */ + pneg_ctxt->CipherCount = cpu_to_le16(2); + pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM; + pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM; + } } static unsigned int @@ -598,8 +610,29 @@ static int decode_encrypt_ctx(struct TCP_Server_Info *server, return -EINVAL; } cifs_dbg(FYI, "SMB311 cipher type:%d\n", le16_to_cpu(ctxt->Ciphers[0])); - if ((ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_CCM) && - (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_GCM)) { + if (require_gcm_256) { + if (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES256_GCM) { + cifs_dbg(VFS, "Server does not support requested encryption type (AES256 GCM)\n"); + return -EOPNOTSUPP; + } + } else if (ctxt->Ciphers[0] == 0) { + /* + * e.g. if server only supported AES256_CCM (very unlikely) + * or server supported no encryption types or had all disabled. + * Since GLOBAL_CAP_ENCRYPTION will be not set, in the case + * in which mount requested encryption ("seal") checks later + * on during tree connection will return proper rc, but if + * seal not requested by client, since server is allowed to + * return 0 to indicate no supported cipher, we can't fail here + */ + server->cipher_type = 0; + server->capabilities &= ~SMB2_GLOBAL_CAP_ENCRYPTION; + pr_warn_once("Server does not support requested encryption types\n"); + return 0; + } else if ((ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_CCM) && + (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_GCM) && + (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES256_GCM)) { + /* server returned a cipher we didn't ask for */ pr_warn_once("Invalid SMB3.11 cipher returned\n"); return -EINVAL; } @@ -1948,9 +1981,11 @@ smb2_parse_contexts(struct TCP_Server_Info *server, unsigned int next; unsigned int remaining; char *name; - const char smb3_create_tag_posix[] = {0x93, 0xAD, 0x25, 0x50, 0x9C, - 0xB4, 0x11, 0xE7, 0xB4, 0x23, 0x83, - 0xDE, 0x96, 0x8B, 0xCD, 0x7C}; + static const char smb3_create_tag_posix[] = { + 0x93, 0xAD, 0x25, 0x50, 0x9C, + 0xB4, 0x11, 0xE7, 0xB4, 0x23, 0x83, + 0xDE, 0x96, 0x8B, 0xCD, 0x7C + }; *oplock = 0; data_offset = (char *)rsp + le32_to_cpu(rsp->CreateContextsOffset); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index c3f1baf5bde2..f05f9b12f689 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -128,8 +128,8 @@ struct smb2_sync_pdu { __le16 StructureSize2; /* size of wct area (varies, request specific) */ } __packed; -#define SMB3_AES128CCM_NONCE 11 -#define SMB3_AES128GCM_NONCE 12 +#define SMB3_AES_CCM_NONCE 11 +#define SMB3_AES_GCM_NONCE 12 /* Transform flags (for 3.0 dialect this flag indicates CCM */ #define TRANSFORM_FLAG_ENCRYPTED 0x0001 @@ -153,10 +153,14 @@ struct smb2_compression_transform_hdr { } __packed; /* See MS-SMB2 2.2.42.1 */ +#define SMB2_COMPRESSION_FLAG_NONE 0x0000 +#define SMB2_COMPRESSION_FLAG_CHAINED 0x0001 + struct compression_payload_header { - __le16 AlgorithmId; - __le16 Reserved; - __le32 Length; + __le16 CompressionAlgorithm; + __le16 Flags; + __le32 Length; /* length of compressed playload including field below if present */ + /* __le32 OriginalPayloadSize; */ /* optional */ } __packed; /* See MS-SMB2 2.2.42.2 */ @@ -167,6 +171,26 @@ struct compression_pattern_payload_v1 { __le32 Repetitions; } __packed; +/* See MS-SMB2 2.2.43 */ +struct smb2_rdma_transform { + __le16 RdmaDescriptorOffset; + __le16 RdmaDescriptorLength; + __le32 Channel; /* for values see channel description in smb2 read above */ + __le16 TransformCount; + __le16 Reserved1; + __le32 Reserved2; +} __packed; + +struct smb2_rdma_encryption_transform { + __le16 TransformType; + __le16 SignatureLength; + __le16 NonceLength; + __u16 Reserved; + __u8 Signature[]; /* variable length */ + /* u8 Nonce[] */ + /* followed by padding */ +} __packed; + /* * SMB2 flag definitions */ @@ -297,6 +321,9 @@ struct smb2_negotiate_req { #define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2) #define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3) #define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5) +#define SMB2_TRANSPORT_CAPABILITIES cpu_to_le16(6) +#define SMB2_RDMA_TRANSFORM_CAPABILITIES cpu_to_le16(7) +#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8) #define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100) struct smb2_neg_context { @@ -325,6 +352,9 @@ struct smb2_preauth_neg_context { /* Encryption Algorithms Ciphers */ #define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) #define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002) +/* we currently do not request AES256_CCM since presumably GCM faster */ +#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003) +#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004) /* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */ #define MIN_ENCRYPT_CTXT_DATA_LEN 4 @@ -332,8 +362,9 @@ struct smb2_encryption_neg_context { __le16 ContextType; /* 2 */ __le16 DataLength; __le32 Reserved; - __le16 CipherCount; /* AES-128-GCM and AES-128-CCM */ - __le16 Ciphers[2]; + /* CipherCount usally 2, but can be 3 when AES256-GCM enabled */ + __le16 CipherCount; /* AES128-GCM and AES128-CCM by default */ + __le16 Ciphers[3]; } __packed; /* See MS-SMB2 2.2.3.1.3 */ @@ -351,10 +382,10 @@ struct smb2_encryption_neg_context { struct smb2_compression_capabilities_context { __le16 ContextType; /* 3 */ __le16 DataLength; - __u32 Flags; + __u32 Reserved; __le16 CompressionAlgorithmCount; __u16 Padding; - __u32 Reserved1; + __u32 Flags; __le16 CompressionAlgorithms[3]; } __packed; @@ -363,12 +394,44 @@ struct smb2_compression_capabilities_context { * Its struct simply contains NetName, an array of Unicode characters */ struct smb2_netname_neg_context { - __le16 ContextType; /* 0x100 */ + __le16 ContextType; /* 5 */ __le16 DataLength; __le32 Reserved; __le16 NetName[]; /* hostname of target converted to UCS-2 */ } __packed; +/* + * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6 + * and 2.2.4.1.5 + */ + +/* RDMA Transform IDs */ +#define SMB2_RDMA_TRANSFORM_NONE 0x0000 +#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001 + +struct smb2_rdma_transform_capabilities_context { + __le16 ContextType; /* 7 */ + __le16 DataLength; + __u32 Reserved; + __le16 TransformCount; + __u16 Reserved1; + __u32 Reserved2; + __le16 RDMATransformIds[1]; +} __packed; + +/* Signing algorithms */ +#define SIGNING_ALG_HMAC_SHA256 0 +#define SIGNING_ALG_AES_CMAC 1 +#define SIGNING_ALG_AES_GMAC 2 + +struct smb2_signing_capabilities { + __le16 ContextType; /* 8 */ + __le16 DataLength; + __u32 Reserved; + __le16 SigningAlgorithmCount; + __le16 SigningAlgorithms[]; +} __packed; + #define POSIX_CTXT_DATA_LEN 16 struct smb2_posix_neg_context { __le16 ContextType; /* 0x100 */ @@ -936,6 +999,31 @@ struct copychunk_ioctl_rsp { __le32 TotalBytesWritten; } __packed; +/* See MS-FSCC 2.3.29 and 2.3.30 */ +struct get_retrieval_pointer_count_req { + __le64 StartingVcn; /* virtual cluster number (signed) */ +} __packed; + +struct get_retrieval_pointer_count_rsp { + __le32 ExtentCount; +} __packed; + +/* + * See MS-FSCC 2.3.33 and 2.3.34 + * request is the same as get_retrieval_point_count_req struct above + */ +struct smb3_extents { + __le64 NextVcn; + __le64 Lcn; /* logical cluster number */ +} __packed; + +struct get_retrieval_pointers_refcount_rsp { + __le32 ExtentCount; + __u32 Reserved; + __le64 StartingVcn; + struct smb3_extents extents[]; +} __packed; + struct fsctl_set_integrity_information_req { __le16 ChecksumAlgorithm; __le16 Reserved; @@ -1178,6 +1266,7 @@ struct smb2_flush_rsp { #define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000) #define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001) /* SMB3 or later */ #define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) /* >= SMB3.02 */ +#define SMB2_CHANNEL_RDMA_TRANSFORM cpu_to_le32(0x00000003) /* >= SMB3.02, only used on write */ /* SMB2 read request without RFC1001 length at the beginning */ struct smb2_read_plain_req { @@ -1197,6 +1286,10 @@ struct smb2_read_plain_req { __u8 Buffer[1]; } __packed; +/* Read flags */ +#define SMB2_READFLAG_RESPONSE_NONE 0x00000000 +#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM 0x00000001 + struct smb2_read_rsp { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 17 */ @@ -1204,7 +1297,7 @@ struct smb2_read_rsp { __u8 Reserved; __le32 DataLength; __le32 DataRemaining; - __u32 Reserved2; + __u32 Flags; __u8 Buffer[1]; } __packed; @@ -1572,6 +1665,7 @@ struct smb2_file_rename_info { /* encoding of request for level 10 */ __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ __le32 FileNameLength; char FileName[]; /* New name to be assigned */ + /* padding - overall struct size must be >= 24 so filename + pad >= 6 */ } __packed; /* level 10 Set */ struct smb2_file_link_info { /* encoding of request for level 11 */ @@ -1623,6 +1717,11 @@ struct smb2_file_eof_info { /* encoding of request for level 10 */ __le64 EndOfFile; /* new end of file value */ } __packed; /* level 20 Set */ +struct smb2_file_reparse_point_info { + __le64 IndexNumber; + __le32 Tag; +} __packed; + struct smb2_file_network_open_info { __le64 CreationTime; __le64 LastAccessTime; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 2f8ecbf54214..d4110447ee3a 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -70,12 +70,16 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid); extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, struct cifs_fid *pfid); + struct cifs_sb_info *cifs_sb, + struct cached_fid **cfid); extern void close_shroot(struct cached_fid *cfid); extern void close_shroot_lease(struct cached_fid *cfid); extern void close_shroot_lease_locked(struct cached_fid *cfid); extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src); +extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *path, + __u32 *reparse_tag); extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, FILE_ALL_INFO *data, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index c0348e3b1695..ebccd71cc60a 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -849,12 +849,13 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) struct crypto_aead *tfm; if (!server->secmech.ccmaesencrypt) { - if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) + if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || + (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) tfm = crypto_alloc_aead("gcm(aes)", 0, 0); else tfm = crypto_alloc_aead("ccm(aes)", 0, 0); if (IS_ERR(tfm)) { - cifs_server_dbg(VFS, "%s: Failed to alloc encrypt aead\n", + cifs_server_dbg(VFS, "%s: Failed alloc encrypt aead\n", __func__); return PTR_ERR(tfm); } @@ -862,7 +863,8 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) } if (!server->secmech.ccmaesdecrypt) { - if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) + if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || + (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) tfm = crypto_alloc_aead("gcm(aes)", 0, 0); else tfm = crypto_alloc_aead("ccm(aes)", 0, 0); diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index 1ff28529cf4b..a0e84747f567 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -103,6 +103,8 @@ #define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */ #define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ #define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C +#define FSCTL_GET_RETRIEVAL_POINTERS_AND_REFCOUNT 0x000903d3 +#define FSCTL_GET_RETRIEVAL_POINTER_COUNT 0x0009042b #define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF #define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ #define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */ diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index eef4b08c7208..90e0fab69bb8 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -878,33 +878,39 @@ DEFINE_SMB3_RECONNECT_EVENT(partial_send_reconnect); DECLARE_EVENT_CLASS(smb3_credit_class, TP_PROTO(__u64 currmid, char *hostname, - int credits), - TP_ARGS(currmid, hostname, credits), + int credits, + int credits_to_add), + TP_ARGS(currmid, hostname, credits, credits_to_add), TP_STRUCT__entry( __field(__u64, currmid) __field(char *, hostname) __field(int, credits) + __field(int, credits_to_add) ), TP_fast_assign( __entry->currmid = currmid; __entry->hostname = hostname; __entry->credits = credits; + __entry->credits_to_add = credits_to_add; ), - TP_printk("server=%s current_mid=0x%llx credits=%d", + TP_printk("server=%s current_mid=0x%llx credits=%d credits_to_add=%d", __entry->hostname, __entry->currmid, - __entry->credits) + __entry->credits, + __entry->credits_to_add) ) #define DEFINE_SMB3_CREDIT_EVENT(name) \ DEFINE_EVENT(smb3_credit_class, smb3_##name, \ TP_PROTO(__u64 currmid, \ char *hostname, \ - int credits), \ - TP_ARGS(currmid, hostname, credits)) + int credits, \ + int credits_to_add), \ + TP_ARGS(currmid, hostname, credits, credits_to_add)) DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits); DEFINE_SMB3_CREDIT_EVENT(credit_timeout); +DEFINE_SMB3_CREDIT_EVENT(add_credits); #endif /* _CIFS_TRACE_H */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index ac7632482736..e27e255d40dd 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -563,7 +563,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, cifs_num_waiters_dec(server); if (!rc) { trace_smb3_credit_timeout(server->CurrentMid, - server->hostname, num_credits); + server->hostname, num_credits, 0); cifs_server_dbg(VFS, "wait timed out after %d ms\n", timeout); return -ENOTSUPP; @@ -604,7 +604,8 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, if (!rc) { trace_smb3_credit_timeout( server->CurrentMid, - server->hostname, num_credits); + server->hostname, num_credits, + 0); cifs_server_dbg(VFS, "wait timed out after %d ms\n", timeout); return -ENOTSUPP; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 912308600d39..4b90cfd1ec36 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -690,8 +690,7 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = 0; buf->f_files = CRAMFS_SB(sb)->files; buf->f_ffree = 0; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = CRAMFS_MAXPATHLEN; return 0; } @@ -559,8 +559,11 @@ fallback: } /** - * dax_layout_busy_page - find first pinned page in @mapping + * dax_layout_busy_page_range - find first pinned page in @mapping * @mapping: address space to scan for a page with ref count > 1 + * @start: Starting offset. Page containing 'start' is included. + * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX, + * pages from 'start' till the end of file are included. * * DAX requires ZONE_DEVICE mapped pages. These pages are never * 'onlined' to the page allocator so they are considered idle when @@ -573,12 +576,15 @@ fallback: * to be able to run unmap_mapping_range() and subsequently not race * mapping_mapped() becoming true. */ -struct page *dax_layout_busy_page(struct address_space *mapping) +struct page *dax_layout_busy_page_range(struct address_space *mapping, + loff_t start, loff_t end) { - XA_STATE(xas, &mapping->i_pages, 0); void *entry; unsigned int scanned = 0; struct page *page = NULL; + pgoff_t start_idx = start >> PAGE_SHIFT; + pgoff_t end_idx; + XA_STATE(xas, &mapping->i_pages, start_idx); /* * In the 'limited' case get_user_pages() for dax is disabled. @@ -589,6 +595,11 @@ struct page *dax_layout_busy_page(struct address_space *mapping) if (!dax_mapping(mapping) || !mapping_mapped(mapping)) return NULL; + /* If end == LLONG_MAX, all pages from start to till end of file */ + if (end == LLONG_MAX) + end_idx = ULONG_MAX; + else + end_idx = end >> PAGE_SHIFT; /* * If we race get_user_pages_fast() here either we'll see the * elevated page count in the iteration and wait, or @@ -596,15 +607,15 @@ struct page *dax_layout_busy_page(struct address_space *mapping) * against is no longer mapped in the page tables and bail to the * get_user_pages() slow path. The slow path is protected by * pte_lock() and pmd_lock(). New references are not taken without - * holding those locks, and unmap_mapping_range() will not zero the + * holding those locks, and unmap_mapping_pages() will not zero the * pte or pmd without holding the respective lock, so we are * guaranteed to either see new references or prevent new * references from being established. */ - unmap_mapping_range(mapping, 0, 0, 0); + unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0); xas_lock_irq(&xas); - xas_for_each(&xas, entry, ULONG_MAX) { + xas_for_each(&xas, entry, end_idx) { if (WARN_ON_ONCE(!xa_is_value(entry))) continue; if (unlikely(dax_is_locked(entry))) @@ -625,6 +636,12 @@ struct page *dax_layout_busy_page(struct address_space *mapping) xas_unlock_irq(&xas); return page; } +EXPORT_SYMBOL_GPL(dax_layout_busy_page_range); + +struct page *dax_layout_busy_page(struct address_space *mapping) +{ + return dax_layout_busy_page_range(mapping, 0, LLONG_MAX); +} EXPORT_SYMBOL_GPL(dax_layout_busy_page); static int __dax_invalidate_entry(struct address_space *mapping, diff --git a/fs/efs/super.c b/fs/efs/super.c index a4a945d0ac6a..62b155b9366b 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -342,8 +342,7 @@ static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { sbi->inode_blocks * (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); buf->f_ffree = sbi->inode_free; /* free inodes */ - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ return 0; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index b9a09806512a..be10b16ea66e 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -561,8 +561,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = EROFS_NAME_LEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 573659bfbc55..916797077aad 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -59,9 +59,9 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, } /* read a directory entry from the opened directory */ -static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry) +static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_entry *dir_entry) { - int i, dentries_per_clu, dentries_per_clu_bits = 0; + int i, dentries_per_clu, dentries_per_clu_bits = 0, num_ext; unsigned int type, clu_offset; sector_t sector; struct exfat_chain dir, clu; @@ -70,7 +70,7 @@ static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry) struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); - unsigned int dentry = ei->rwoffset & 0xFFFFFFFF; + unsigned int dentry = EXFAT_B_TO_DEN(*cpos) & 0xFFFFFFFF; struct buffer_head *bh; /* check if the given file ID is opened */ @@ -127,6 +127,7 @@ static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry) continue; } + num_ext = ep->dentry.file.num_ext; dir_entry->attr = le16_to_cpu(ep->dentry.file.attr); exfat_get_entry_time(sbi, &dir_entry->crtime, ep->dentry.file.create_tz, @@ -157,12 +158,13 @@ static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry) return -EIO; dir_entry->size = le64_to_cpu(ep->dentry.stream.valid_size); + dir_entry->entry = dentry; brelse(bh); ei->hint_bmap.off = dentry >> dentries_per_clu_bits; ei->hint_bmap.clu = clu.dir; - ei->rwoffset = ++dentry; + *cpos = EXFAT_DEN_TO_B(dentry + 1 + num_ext); return 0; } @@ -178,7 +180,7 @@ static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry) } dir_entry->namebuf.lfn[0] = '\0'; - ei->rwoffset = dentry; + *cpos = EXFAT_DEN_TO_B(dentry); return 0; } @@ -242,12 +244,10 @@ static int exfat_iterate(struct file *filp, struct dir_context *ctx) if (err) goto unlock; get_new: - ei->rwoffset = EXFAT_B_TO_DEN(cpos); - if (cpos >= i_size_read(inode)) goto end_of_dir; - err = exfat_readdir(inode, &de); + err = exfat_readdir(inode, &cpos, &de); if (err) { /* * At least we tried to read a sector. Move cpos to next sector @@ -262,13 +262,10 @@ get_new: goto end_of_dir; } - cpos = EXFAT_DEN_TO_B(ei->rwoffset); - if (!nb->lfn[0]) goto end_of_dir; - i_pos = ((loff_t)ei->start_clu << 32) | - ((ei->rwoffset - 1) & 0xffffffff); + i_pos = ((loff_t)ei->start_clu << 32) | (de.entry & 0xffffffff); tmp = exfat_iget(sb, i_pos); if (tmp) { inum = tmp->i_ino; @@ -911,7 +908,6 @@ enum { /* * return values: * >= 0 : return dir entiry position with the name in dir - * -EEXIST : (root dir, ".") it is the root dir itself * -ENOENT : entry with the name does not exist * -EIO : I/O error */ @@ -979,11 +975,8 @@ rewind: if (ei->hint_femp.eidx == EXFAT_HINT_NONE || candi_empty.eidx <= - ei->hint_femp.eidx) { - memcpy(&ei->hint_femp, - &candi_empty, - sizeof(candi_empty)); - } + ei->hint_femp.eidx) + ei->hint_femp = candi_empty; } brelse(bh); diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index c013fe931d9c..b8f0e829ecbd 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -128,7 +128,7 @@ enum { struct exfat_dentry_namebuf { char *lfn; - int lfnbuf_len; /* usally MAX_UNINAME_BUF_SIZE */ + int lfnbuf_len; /* usually MAX_UNINAME_BUF_SIZE */ }; /* unicode name structure */ @@ -265,8 +265,6 @@ struct exfat_inode_info { * the validation of hint_stat. */ unsigned int version; - /* file offset or dentry index for readdir */ - loff_t rwoffset; /* hint for cluster last accessed */ struct exfat_hint hint_bmap; diff --git a/fs/exfat/file.c b/fs/exfat/file.c index f41f523a58ad..a92478eabfa4 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -208,8 +208,6 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) /* hint information */ ei->hint_bmap.off = EXFAT_EOF_CLUSTER; ei->hint_bmap.clu = EXFAT_EOF_CLUSTER; - if (ei->rwoffset > new_size) - ei->rwoffset = new_size; /* hint_stat will be used if this is directory. */ ei->hint_stat.eidx = 0; @@ -229,7 +227,7 @@ void exfat_truncate(struct inode *inode, loff_t size) { struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); - unsigned int blocksize = 1 << inode->i_blkbits; + unsigned int blocksize = i_blocksize(inode); loff_t aligned_size; int err; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index a6de17cac3df..730373e0965a 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -114,8 +114,6 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset, unsigned int local_clu_offset = clu_offset; unsigned int num_to_be_allocated = 0, num_clusters = 0; - ei->rwoffset = EXFAT_CLU_TO_B(clu_offset, sbi); - if (EXFAT_I(inode)->i_size_ondisk > 0) num_clusters = EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk, @@ -556,7 +554,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) struct exfat_inode_info *ei = EXFAT_I(inode); loff_t size = info->size; - memcpy(&ei->dir, &info->dir, sizeof(struct exfat_chain)); + ei->dir = info->dir; ei->entry = info->entry; ei->attr = info->attr; ei->start_clu = info->start_clu; @@ -567,7 +565,6 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) ei->hint_stat.eidx = 0; ei->hint_stat.clu = info->start_clu; ei->hint_femp.eidx = EXFAT_HINT_NONE; - ei->rwoffset = 0; ei->hint_bmap.off = EXFAT_EOF_CLUSTER; ei->i_pos = 0; diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index c94ac239f740..2932b23a3b6c 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -290,7 +290,7 @@ static int exfat_check_max_dentries(struct inode *inode) { if (EXFAT_B_TO_DEN(i_size_read(inode)) >= MAX_EXFAT_DENTRIES) { /* - * exFAT spec allows a dir to grow upto 8388608(256MB) + * exFAT spec allows a dir to grow up to 8388608(256MB) * dentries */ return -ENOSPC; @@ -318,8 +318,7 @@ static int exfat_find_empty_entry(struct inode *inode, hint_femp.eidx = EXFAT_HINT_NONE; if (ei->hint_femp.eidx != EXFAT_HINT_NONE) { - memcpy(&hint_femp, &ei->hint_femp, - sizeof(struct exfat_hint_femp)); + hint_femp = ei->hint_femp; ei->hint_femp.eidx = EXFAT_HINT_NONE; } @@ -519,7 +518,7 @@ static int exfat_add_entry(struct inode *inode, const char *path, if (ret) goto out; - memcpy(&info->dir, p_dir, sizeof(struct exfat_chain)); + info->dir = *p_dir; info->entry = dentry; info->flags = ALLOC_NO_FAT_CHAIN; info->type = type; @@ -530,19 +529,10 @@ static int exfat_add_entry(struct inode *inode, const char *path, info->size = 0; info->num_subdirs = 0; } else { - int count; - struct exfat_chain cdir; - info->attr = ATTR_SUBDIR; info->start_clu = start_clu; info->size = clu_size; - - exfat_chain_set(&cdir, info->start_clu, - EXFAT_B_TO_CLU(info->size, sbi), info->flags); - count = exfat_count_dir_entries(sb, &cdir); - if (count < 0) - return -EIO; - info->num_subdirs = count + EXFAT_MIN_SUBDIR; + info->num_subdirs = EXFAT_MIN_SUBDIR; } memset(&info->crtime, 0, sizeof(info->crtime)); memset(&info->mtime, 0, sizeof(info->mtime)); @@ -604,6 +594,8 @@ static int exfat_find(struct inode *dir, struct qstr *qname, struct super_block *sb = dir->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(dir); + struct exfat_dentry *ep, *ep2; + struct exfat_entry_set_cache *es; if (qname->len == 0) return -ENOENT; @@ -629,91 +621,63 @@ static int exfat_find(struct inode *dir, struct qstr *qname, dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name, num_entries, TYPE_ALL); - if ((dentry < 0) && (dentry != -EEXIST)) + if (dentry < 0) return dentry; /* -error value */ - memcpy(&info->dir, &cdir.dir, sizeof(struct exfat_chain)); + info->dir = cdir; info->entry = dentry; info->num_subdirs = 0; - /* root directory itself */ - if (unlikely(dentry == -EEXIST)) { - int num_clu = 0; + es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES); + if (!es) + return -EIO; + ep = exfat_get_dentry_cached(es, 0); + ep2 = exfat_get_dentry_cached(es, 1); + + info->type = exfat_get_entry_type(ep); + info->attr = le16_to_cpu(ep->dentry.file.attr); + info->size = le64_to_cpu(ep2->dentry.stream.valid_size); + if ((info->type == TYPE_FILE) && (info->size == 0)) { + info->flags = ALLOC_NO_FAT_CHAIN; + info->start_clu = EXFAT_EOF_CLUSTER; + } else { + info->flags = ep2->dentry.stream.flags; + info->start_clu = + le32_to_cpu(ep2->dentry.stream.start_clu); + } - info->type = TYPE_DIR; - info->attr = ATTR_SUBDIR; - info->flags = ALLOC_FAT_CHAIN; - info->start_clu = sbi->root_dir; - memset(&info->crtime, 0, sizeof(info->crtime)); - memset(&info->mtime, 0, sizeof(info->mtime)); - memset(&info->atime, 0, sizeof(info->atime)); - - exfat_chain_set(&cdir, sbi->root_dir, 0, ALLOC_FAT_CHAIN); - if (exfat_count_num_clusters(sb, &cdir, &num_clu)) - return -EIO; - info->size = num_clu << sbi->cluster_size_bits; + exfat_get_entry_time(sbi, &info->crtime, + ep->dentry.file.create_tz, + ep->dentry.file.create_time, + ep->dentry.file.create_date, + ep->dentry.file.create_time_cs); + exfat_get_entry_time(sbi, &info->mtime, + ep->dentry.file.modify_tz, + ep->dentry.file.modify_time, + ep->dentry.file.modify_date, + ep->dentry.file.modify_time_cs); + exfat_get_entry_time(sbi, &info->atime, + ep->dentry.file.access_tz, + ep->dentry.file.access_time, + ep->dentry.file.access_date, + 0); + exfat_free_dentry_set(es, false); + + if (ei->start_clu == EXFAT_FREE_CLUSTER) { + exfat_fs_error(sb, + "non-zero size file starts with zero cluster (size : %llu, p_dir : %u, entry : 0x%08x)", + i_size_read(dir), ei->dir.dir, ei->entry); + return -EIO; + } + if (info->type == TYPE_DIR) { + exfat_chain_set(&cdir, info->start_clu, + EXFAT_B_TO_CLU(info->size, sbi), info->flags); count = exfat_count_dir_entries(sb, &cdir); if (count < 0) return -EIO; - info->num_subdirs = count; - } else { - struct exfat_dentry *ep, *ep2; - struct exfat_entry_set_cache *es; - - es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES); - if (!es) - return -EIO; - ep = exfat_get_dentry_cached(es, 0); - ep2 = exfat_get_dentry_cached(es, 1); - - info->type = exfat_get_entry_type(ep); - info->attr = le16_to_cpu(ep->dentry.file.attr); - info->size = le64_to_cpu(ep2->dentry.stream.valid_size); - if ((info->type == TYPE_FILE) && (info->size == 0)) { - info->flags = ALLOC_NO_FAT_CHAIN; - info->start_clu = EXFAT_EOF_CLUSTER; - } else { - info->flags = ep2->dentry.stream.flags; - info->start_clu = - le32_to_cpu(ep2->dentry.stream.start_clu); - } - - if (ei->start_clu == EXFAT_FREE_CLUSTER) { - exfat_fs_error(sb, - "non-zero size file starts with zero cluster (size : %llu, p_dir : %u, entry : 0x%08x)", - i_size_read(dir), ei->dir.dir, ei->entry); - exfat_free_dentry_set(es, false); - return -EIO; - } - - exfat_get_entry_time(sbi, &info->crtime, - ep->dentry.file.create_tz, - ep->dentry.file.create_time, - ep->dentry.file.create_date, - ep->dentry.file.create_time_cs); - exfat_get_entry_time(sbi, &info->mtime, - ep->dentry.file.modify_tz, - ep->dentry.file.modify_time, - ep->dentry.file.modify_date, - ep->dentry.file.modify_time_cs); - exfat_get_entry_time(sbi, &info->atime, - ep->dentry.file.access_tz, - ep->dentry.file.access_time, - ep->dentry.file.access_date, - 0); - exfat_free_dentry_set(es, false); - - if (info->type == TYPE_DIR) { - exfat_chain_set(&cdir, info->start_clu, - EXFAT_B_TO_CLU(info->size, sbi), info->flags); - count = exfat_count_dir_entries(sb, &cdir); - if (count < 0) - return -EIO; - - info->num_subdirs = count + EXFAT_MIN_SUBDIR; - } + info->num_subdirs = count + EXFAT_MIN_SUBDIR; } return 0; } @@ -1065,7 +1029,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir, if (!epnew) return -EIO; - memcpy(epnew, epold, DENTRY_SIZE); + *epnew = *epold; if (exfat_get_entry_type(epnew) == TYPE_FILE) { epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE); ei->attr |= ATTR_ARCHIVE; @@ -1085,7 +1049,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir, return -EIO; } - memcpy(epnew, epold, DENTRY_SIZE); + *epnew = *epold; exfat_update_bh(new_bh, sync); brelse(old_bh); brelse(new_bh); @@ -1130,11 +1094,6 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir, if (!epmov) return -EIO; - /* check if the source and target directory is the same */ - if (exfat_get_entry_type(epmov) == TYPE_DIR && - le32_to_cpu(epmov->dentry.stream.start_clu) == p_newdir->dir) - return -EINVAL; - num_old_entries = exfat_count_ext_entries(sb, p_olddir, oldentry, epmov); if (num_old_entries < 0) @@ -1153,7 +1112,7 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir, if (!epnew) return -EIO; - memcpy(epnew, epmov, DENTRY_SIZE); + *epnew = *epmov; if (exfat_get_entry_type(epnew) == TYPE_FILE) { epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE); ei->attr |= ATTR_ARCHIVE; @@ -1173,7 +1132,7 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir, return -EIO; } - memcpy(epnew, epmov, DENTRY_SIZE); + *epnew = *epmov; exfat_update_bh(new_bh, IS_DIRSYNC(inode)); brelse(mov_bh); brelse(new_bh); diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index a3c927501e67..675d0e7058c5 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -11,7 +11,7 @@ #include "exfat_raw.h" #include "exfat_fs.h" -/* Upcase tabel macro */ +/* Upcase table macro */ #define EXFAT_NUM_UPCASE (2918) #define UTBL_COUNT (0x10000) diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 60b941ba557b..87be5bfc31eb 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -89,8 +89,7 @@ static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = sbi->num_clusters - 2; /* clu 0 & 1 */ buf->f_bfree = buf->f_blocks - sbi->used_clusters; buf->f_bavail = buf->f_bfree; - buf->f_fsid.val[0] = (unsigned int)id; - buf->f_fsid.val[1] = (unsigned int)(id >> 32); + buf->f_fsid = u64_to_fsid(id); /* Unicode utf16 255 characters */ buf->f_namelen = EXFAT_MAX_FILE_LEN * NLS_MAX_CHARSET_SIZE; return 0; @@ -342,7 +341,6 @@ static int exfat_read_root(struct inode *inode) ei->flags = ALLOC_FAT_CHAIN; ei->type = TYPE_DIR; ei->version = 0; - ei->rwoffset = 0; ei->hint_bmap.off = EXFAT_EOF_CLUSTER; ei->hint_stat.eidx = 0; ei->hint_stat.clu = sbi->root_dir; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 7fab2b3b5b39..09f1fe676972 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1455,8 +1455,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) buf->f_namelen = EXT2_NAME_LEN; fsid = le64_to_cpup((void *)es->s_uuid) ^ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); - buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + buf->f_fsid = u64_to_fsid(fsid); spin_unlock(&sbi->s_lock); return 0; } diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 2e42f47a7f98..49e7af6cc93f 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -10,7 +10,7 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \ mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \ - xattr_user.o + xattr_user.o fast_commit.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 76f634d185f1..68aaed48315f 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -242,6 +242,7 @@ retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) return PTR_ERR(handle); + ext4_fc_start_update(inode); if ((type == ACL_TYPE_ACCESS) && acl) { error = posix_acl_update_mode(inode, &mode, &acl); @@ -259,6 +260,7 @@ retry: } out_stop: ext4_journal_stop(handle); + ext4_fc_stop_update(inode); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return error; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 48c3df47748d..1d640b145637 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -368,7 +368,12 @@ static int ext4_validate_block_bitmap(struct super_block *sb, struct buffer_head *bh) { ext4_fsblk_t blk; - struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); + struct ext4_group_info *grp; + + if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) + return 0; + + grp = ext4_get_group_info(sb, block_group); if (buffer_verified(bh)) return 0; @@ -495,10 +500,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, */ set_buffer_new(bh); trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked); - bh->b_end_io = ext4_end_bitmap_read; - get_bh(bh); - submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO | - (ignore_locked ? REQ_RAHEAD : 0), bh); + ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO | + (ignore_locked ? REQ_RAHEAD : 0), + ext4_end_bitmap_read); return bh; verify: err = ext4_validate_block_bitmap(sb, desc, block_group, bh); diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index c54ba52f2dd4..8e6ca23ed172 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -131,7 +131,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi) printk(KERN_INFO "System zones: "); rcu_read_lock(); - system_blks = rcu_dereference(sbi->system_blks); + system_blks = rcu_dereference(sbi->s_system_blks); node = rb_first(&system_blks->root); while (node) { entry = rb_entry(node, struct ext4_system_zone, node); @@ -261,7 +261,7 @@ int ext4_setup_system_zone(struct super_block *sb) * with ext4_data_block_valid() accessing the rbtree at the same * time. */ - rcu_assign_pointer(sbi->system_blks, system_blks); + rcu_assign_pointer(sbi->s_system_blks, system_blks); if (test_opt(sb, DEBUG)) debug_print_tree(sbi); @@ -286,9 +286,9 @@ void ext4_release_system_zone(struct super_block *sb) { struct ext4_system_blocks *system_blks; - system_blks = rcu_dereference_protected(EXT4_SB(sb)->system_blks, + system_blks = rcu_dereference_protected(EXT4_SB(sb)->s_system_blks, lockdep_is_held(&sb->s_umount)); - rcu_assign_pointer(EXT4_SB(sb)->system_blks, NULL); + rcu_assign_pointer(EXT4_SB(sb)->s_system_blks, NULL); if (system_blks) call_rcu(&system_blks->rcu, ext4_destroy_system_zone); @@ -319,7 +319,7 @@ int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, * mount option. */ rcu_read_lock(); - system_blks = rcu_dereference(sbi->system_blks); + system_blks = rcu_dereference(sbi->s_system_blks); if (system_blks == NULL) goto out_rcu; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index efe77cffc322..5b81f3b080ee 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -674,7 +674,7 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len, { struct qstr qstr = {.name = str, .len = len }; const struct dentry *parent = READ_ONCE(dentry->d_parent); - const struct inode *inode = READ_ONCE(parent->d_inode); + const struct inode *inode = d_inode_rcu(parent); char strbuf[DNAME_INLINE_LEN]; if (!inode || !IS_CASEFOLDED(inode) || @@ -706,7 +706,7 @@ static int ext4_d_hash(const struct dentry *dentry, struct qstr *str) { const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb); const struct unicode_map *um = sbi->s_encoding; - const struct inode *inode = READ_ONCE(dentry->d_inode); + const struct inode *inode = d_inode_rcu(dentry); unsigned char *norm; int len, ret = 0; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f9a692c0a66c..254d1c26bea8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -27,7 +27,6 @@ #include <linux/seqlock.h> #include <linux/mutex.h> #include <linux/timer.h> -#include <linux/version.h> #include <linux/wait.h> #include <linux/sched/signal.h> #include <linux/blockgroup_lock.h> @@ -492,7 +491,7 @@ struct flex_groups { /* Flags which are mutually exclusive to DAX */ #define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\ - EXT4_JOURNAL_DATA_FL) + EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL) /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) @@ -964,6 +963,7 @@ do { \ #endif /* defined(__KERNEL__) || defined(__linux__) */ #include "extents_status.h" +#include "fast_commit.h" /* * Lock subclasses for i_data_sem in the ext4_inode_info structure. @@ -1021,6 +1021,31 @@ struct ext4_inode_info { struct list_head i_orphan; /* unlinked but open inodes */ + /* Fast commit related info */ + + struct list_head i_fc_list; /* + * inodes that need fast commit + * protected by sbi->s_fc_lock. + */ + + /* Fast commit subtid when this inode was committed */ + unsigned int i_fc_committed_subtid; + + /* Start of lblk range that needs to be committed in this fast commit */ + ext4_lblk_t i_fc_lblk_start; + + /* End of lblk range that needs to be committed in this fast commit */ + ext4_lblk_t i_fc_lblk_len; + + /* Number of ongoing updates on this inode */ + atomic_t i_fc_updates; + + /* Fast commit wait queue for this inode */ + wait_queue_head_t i_fc_wait; + + /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */ + struct mutex i_fc_lock; + /* * i_disksize keeps track of what the inode size is ON DISK, not * in memory. During truncate, i_size is set to the new size by @@ -1141,6 +1166,11 @@ struct ext4_inode_info { #define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ #define EXT4_ERROR_FS 0x0002 /* Errors detected */ #define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ +#define EXT4_FC_INELIGIBLE 0x0008 /* Fast commit ineligible */ +#define EXT4_FC_COMMITTING 0x0010 /* File system underoing a fast + * commit. + */ +#define EXT4_FC_REPLAY 0x0020 /* Fast commit replay ongoing */ /* * Misc. filesystem flags @@ -1214,6 +1244,8 @@ struct ext4_inode_info { #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly specified journal checksum */ +#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */ + #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ @@ -1481,14 +1513,14 @@ struct ext4_sb_info { unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; - struct block_device *journal_bdev; + struct block_device *s_journal_bdev; #ifdef CONFIG_QUOTA /* Names of quota files with journalled quota */ char __rcu *s_qf_names[EXT4_MAXQUOTAS]; int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ - struct ext4_system_blocks __rcu *system_blks; + struct ext4_system_blocks __rcu *s_system_blks; #ifdef EXTENTS_STATS /* ext4 extents stats */ @@ -1611,6 +1643,34 @@ struct ext4_sb_info { /* Record the errseq of the backing block device */ errseq_t s_bdev_wb_err; spinlock_t s_bdev_wb_lock; + + /* Ext4 fast commit stuff */ + atomic_t s_fc_subtid; + atomic_t s_fc_ineligible_updates; + /* + * After commit starts, the main queue gets locked, and the further + * updates get added in the staging queue. + */ +#define FC_Q_MAIN 0 +#define FC_Q_STAGING 1 + struct list_head s_fc_q[2]; /* Inodes staged for fast commit + * that have data changes in them. + */ + struct list_head s_fc_dentry_q[2]; /* directory entry updates */ + unsigned int s_fc_bytes; + /* + * Main fast commit lock. This lock protects accesses to the + * following fields: + * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh. + */ + spinlock_t s_fc_lock; + struct buffer_head *s_fc_bh; + struct ext4_fc_stats s_fc_stats; + u64 s_fc_avg_commit_time; +#ifdef CONFIG_EXT4_DEBUG + int s_fc_debug_max_replay; +#endif + struct ext4_fc_replay_state s_fc_replay_state; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1721,6 +1781,7 @@ enum { EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ + EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1814,6 +1875,7 @@ static inline bool ext4_verity_in_progress(struct inode *inode) #define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 #define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 +#define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400 #define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800 #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 @@ -1916,6 +1978,7 @@ EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR) EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE) EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) +EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT) EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES) EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) @@ -2650,6 +2713,7 @@ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, struct dx_hash_info *hinfo); /* ialloc.c */ +extern int ext4_mark_inode_used(struct super_block *sb, int ino); extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, @@ -2675,6 +2739,27 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier); extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); +/* fast_commit.c */ +int ext4_fc_info_show(struct seq_file *seq, void *v); +void ext4_fc_init(struct super_block *sb, journal_t *journal); +void ext4_fc_init_inode(struct inode *inode); +void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); +void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry); +void ext4_fc_track_link(struct inode *inode, struct dentry *dentry); +void ext4_fc_track_create(struct inode *inode, struct dentry *dentry); +void ext4_fc_track_inode(struct inode *inode); +void ext4_fc_mark_ineligible(struct super_block *sb, int reason); +void ext4_fc_start_ineligible(struct super_block *sb, int reason); +void ext4_fc_stop_ineligible(struct super_block *sb); +void ext4_fc_start_update(struct inode *inode); +void ext4_fc_stop_update(struct inode *inode); +void ext4_fc_del(struct inode *inode); +bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); +void ext4_fc_replay_cleanup(struct super_block *sb); +int ext4_fc_commit(journal_t *journal, tid_t commit_tid); +int __init ext4_fc_init_dentry_cache(void); + /* mballoc.c */ extern const struct seq_operations ext4_mb_seq_groups_ops; extern long ext4_mb_stats; @@ -2704,8 +2789,12 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); +extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, + int len, int state); /* inode.c */ +void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei); int ext4_inode_is_fast_symlink(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); @@ -2752,6 +2841,8 @@ extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, + struct ext4_iloc *iloc); extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); @@ -2785,12 +2876,15 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); +extern void ext4_reset_inode_seed(struct inode *inode); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); extern int ext4_ind_migrate(struct inode *inode); /* namei.c */ +extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, + struct inode *inode); extern int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh); extern int ext4_orphan_add(handle_t *, struct inode *); @@ -2824,6 +2918,14 @@ extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags); +extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, + sector_t block); +extern void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags, + bh_end_io_t *end_io); +extern int ext4_read_bh(struct buffer_head *bh, int op_flags, + bh_end_io_t *end_io); +extern int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait); +extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); extern void ext4_superblock_csum_set(struct super_block *sb); @@ -3012,22 +3114,24 @@ static inline int ext4_has_group_desc_csum(struct super_block *sb) return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); } +#define ext4_read_incompat_64bit_val(es, name) \ + (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \ + ? (ext4_fsblk_t)le32_to_cpu(es->name##_hi) << 32 : 0) | \ + le32_to_cpu(es->name##_lo)) + static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { - return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | - le32_to_cpu(es->s_blocks_count_lo); + return ext4_read_incompat_64bit_val(es, s_blocks_count); } static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) { - return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | - le32_to_cpu(es->s_r_blocks_count_lo); + return ext4_read_incompat_64bit_val(es, s_r_blocks_count); } static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) { - return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | - le32_to_cpu(es->s_free_blocks_count_lo); + return ext4_read_incompat_64bit_val(es, s_free_blocks_count); } static inline void ext4_blocks_count_set(struct ext4_super_block *es, @@ -3153,6 +3257,9 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, struct ext4_group_info { unsigned long bb_state; +#ifdef AGGRESSIVE_CHECK + unsigned long bb_check_counter; +#endif struct rb_root bb_free_root; ext4_grpblk_t bb_first_free; /* first free block */ ext4_grpblk_t bb_free; /* total free blocks */ @@ -3357,6 +3464,10 @@ extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, extern int ext4_ci_compare(const struct inode *parent, const struct qstr *fname, const struct qstr *entry, bool quick); +extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, + struct inode *inode); +extern int __ext4_link(struct inode *dir, struct inode *inode, + struct dentry *dentry); #define S_SHIFT 12 static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { @@ -3457,6 +3568,11 @@ extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, int check_cred, int restart_cred, int revoke_cred); +extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end); +extern int ext4_ext_replay_set_iblocks(struct inode *inode); +extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, + int len, int unwritten, ext4_fsblk_t pblk); +extern int ext4_ext_clear_bb(struct inode *inode); /* move_extent.c */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 760b9ee49dc0..0fd0c42a4f7d 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -100,7 +100,7 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, return ERR_PTR(err); journal = EXT4_SB(sb)->s_journal; - if (!journal) + if (!journal || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return ext4_get_nojournal(); return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds, GFP_NOFS, type, line); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a0481582187a..559100f3e23c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -501,7 +501,7 @@ __read_extent_tree_block(const char *function, unsigned int line, if (!bh_uptodate_or_lock(bh)) { trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); - err = bh_submit_read(bh); + err = ext4_read_bh(bh, 0, NULL); if (err < 0) goto errout; } @@ -3723,6 +3723,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, err = ext4_ext_dirty(handle, inode, path + path->p_depth); out: ext4_ext_show_leaf(inode, path); + ext4_fc_track_range(inode, ee_block, ee_block + ee_len - 1); return err; } @@ -3794,6 +3795,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, if (*allocated > map->m_len) *allocated = map->m_len; map->m_len = *allocated; + ext4_fc_track_range(inode, ee_block, ee_block + ee_len - 1); return 0; } @@ -4023,7 +4025,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) * - * return > 0, number of of blocks already mapped/allocated + * return > 0, number of blocks already mapped/allocated * if create == 0 and these are pre-allocated blocks * buffer head is unmapped * otherwise blocks are mapped @@ -4327,7 +4329,7 @@ got_allocated_blocks: map->m_len = ar.len; allocated = map->m_len; ext4_ext_show_leaf(inode, path); - + ext4_fc_track_range(inode, map->m_lblk, map->m_lblk + map->m_len - 1); out: ext4_ext_drop_refs(path); kfree(path); @@ -4600,7 +4602,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; - + ext4_fc_track_range(inode, offset >> inode->i_sb->s_blocksize_bits, + (offset + len - 1) >> inode->i_sb->s_blocksize_bits); /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); if (ret >= 0) @@ -4648,23 +4651,34 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; + ext4_fc_track_range(inode, offset >> blkbits, + (offset + len - 1) >> blkbits); - if (mode & FALLOC_FL_PUNCH_HOLE) - return ext4_punch_hole(inode, offset, len); + ext4_fc_start_update(inode); + + if (mode & FALLOC_FL_PUNCH_HOLE) { + ret = ext4_punch_hole(inode, offset, len); + goto exit; + } ret = ext4_convert_inline_data(inode); if (ret) - return ret; + goto exit; - if (mode & FALLOC_FL_COLLAPSE_RANGE) - return ext4_collapse_range(inode, offset, len); - - if (mode & FALLOC_FL_INSERT_RANGE) - return ext4_insert_range(inode, offset, len); + if (mode & FALLOC_FL_COLLAPSE_RANGE) { + ret = ext4_collapse_range(inode, offset, len); + goto exit; + } - if (mode & FALLOC_FL_ZERO_RANGE) - return ext4_zero_range(file, offset, len, mode); + if (mode & FALLOC_FL_INSERT_RANGE) { + ret = ext4_insert_range(inode, offset, len); + goto exit; + } + if (mode & FALLOC_FL_ZERO_RANGE) { + ret = ext4_zero_range(file, offset, len, mode); + goto exit; + } trace_ext4_fallocate_enter(inode, offset, len, mode); lblk = offset >> blkbits; @@ -4698,12 +4712,14 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto out; if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { - ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, - EXT4_I(inode)->i_sync_tid); + ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); } out: inode_unlock(inode); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); +exit: + ext4_fc_stop_update(inode); return ret; } @@ -4769,7 +4785,7 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) { - int ret, err = 0; + int ret = 0, err = 0; struct ext4_io_end_vec *io_end_vec; /* @@ -5291,6 +5307,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } + ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); @@ -5329,6 +5346,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); + ext4_fc_stop_ineligible(sb); out_mmap: up_write(&EXT4_I(inode)->i_mmap_sem); out_mutex: @@ -5429,6 +5447,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } + ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; @@ -5503,6 +5522,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); + ext4_fc_stop_ineligible(sb); out_mmap: up_write(&EXT4_I(inode)->i_mmap_sem); out_mutex: @@ -5784,3 +5804,264 @@ out: return err ? err : mapped; } + +/* + * Updates physical block address and unwritten status of extent + * starting at lblk start and of len. If such an extent doesn't exist, + * this function splits the extent tree appropriately to create an + * extent like this. This function is called in the fast commit + * replay path. Returns 0 on success and error on failure. + */ +int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, + int len, int unwritten, ext4_fsblk_t pblk) +{ + struct ext4_ext_path *path = NULL, *ppath; + struct ext4_extent *ex; + int ret; + + path = ext4_find_extent(inode, start, NULL, 0); + if (!path) + return -EINVAL; + ex = path[path->p_depth].p_ext; + if (!ex) { + ret = -EFSCORRUPTED; + goto out; + } + + if (le32_to_cpu(ex->ee_block) != start || + ext4_ext_get_actual_len(ex) != len) { + /* We need to split this extent to match our extent first */ + ppath = path; + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_force_split_extent_at(NULL, inode, &ppath, start, 1); + up_write(&EXT4_I(inode)->i_data_sem); + if (ret) + goto out; + kfree(path); + path = ext4_find_extent(inode, start, NULL, 0); + if (IS_ERR(path)) + return -1; + ppath = path; + ex = path[path->p_depth].p_ext; + WARN_ON(le32_to_cpu(ex->ee_block) != start); + if (ext4_ext_get_actual_len(ex) != len) { + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_force_split_extent_at(NULL, inode, &ppath, + start + len, 1); + up_write(&EXT4_I(inode)->i_data_sem); + if (ret) + goto out; + kfree(path); + path = ext4_find_extent(inode, start, NULL, 0); + if (IS_ERR(path)) + return -EINVAL; + ex = path[path->p_depth].p_ext; + } + } + if (unwritten) + ext4_ext_mark_unwritten(ex); + else + ext4_ext_mark_initialized(ex); + ext4_ext_store_pblock(ex, pblk); + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]); + up_write(&EXT4_I(inode)->i_data_sem); +out: + ext4_ext_drop_refs(path); + kfree(path); + ext4_mark_inode_dirty(NULL, inode); + return ret; +} + +/* Try to shrink the extent tree */ +void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ex; + ext4_lblk_t old_cur, cur = 0; + + while (cur < end) { + path = ext4_find_extent(inode, cur, NULL, 0); + if (IS_ERR(path)) + return; + ex = path[path->p_depth].p_ext; + if (!ex) { + ext4_ext_drop_refs(path); + kfree(path); + ext4_mark_inode_dirty(NULL, inode); + return; + } + old_cur = cur; + cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); + if (cur <= old_cur) + cur = old_cur + 1; + ext4_ext_try_to_merge(NULL, inode, path, ex); + down_write(&EXT4_I(inode)->i_data_sem); + ext4_ext_dirty(NULL, inode, &path[path->p_depth]); + up_write(&EXT4_I(inode)->i_data_sem); + ext4_mark_inode_dirty(NULL, inode); + ext4_ext_drop_refs(path); + kfree(path); + } +} + +/* Check if *cur is a hole and if it is, skip it */ +static void skip_hole(struct inode *inode, ext4_lblk_t *cur) +{ + int ret; + struct ext4_map_blocks map; + + map.m_lblk = *cur; + map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur; + + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret != 0) + return; + *cur = *cur + map.m_len; +} + +/* Count number of blocks used by this inode and update i_blocks */ +int ext4_ext_replay_set_iblocks(struct inode *inode) +{ + struct ext4_ext_path *path = NULL, *path2 = NULL; + struct ext4_extent *ex; + ext4_lblk_t cur = 0, end; + int numblks = 0, i, ret = 0; + ext4_fsblk_t cmp1, cmp2; + struct ext4_map_blocks map; + + /* Determin the size of the file first */ + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, + EXT4_EX_NOCACHE); + if (IS_ERR(path)) + return PTR_ERR(path); + ex = path[path->p_depth].p_ext; + if (!ex) { + ext4_ext_drop_refs(path); + kfree(path); + goto out; + } + end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); + ext4_ext_drop_refs(path); + kfree(path); + + /* Count the number of data blocks */ + cur = 0; + while (cur < end) { + map.m_lblk = cur; + map.m_len = end - cur; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + break; + if (ret > 0) + numblks += ret; + cur = cur + map.m_len; + } + + /* + * Count the number of extent tree blocks. We do it by looking up + * two successive extents and determining the difference between + * their paths. When path is different for 2 successive extents + * we compare the blocks in the path at each level and increment + * iblocks by total number of differences found. + */ + cur = 0; + skip_hole(inode, &cur); + path = ext4_find_extent(inode, cur, NULL, 0); + if (IS_ERR(path)) + goto out; + numblks += path->p_depth; + ext4_ext_drop_refs(path); + kfree(path); + while (cur < end) { + path = ext4_find_extent(inode, cur, NULL, 0); + if (IS_ERR(path)) + break; + ex = path[path->p_depth].p_ext; + if (!ex) { + ext4_ext_drop_refs(path); + kfree(path); + return 0; + } + cur = max(cur + 1, le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)); + skip_hole(inode, &cur); + + path2 = ext4_find_extent(inode, cur, NULL, 0); + if (IS_ERR(path2)) { + ext4_ext_drop_refs(path); + kfree(path); + break; + } + ex = path2[path2->p_depth].p_ext; + for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) { + cmp1 = cmp2 = 0; + if (i <= path->p_depth) + cmp1 = path[i].p_bh ? + path[i].p_bh->b_blocknr : 0; + if (i <= path2->p_depth) + cmp2 = path2[i].p_bh ? + path2[i].p_bh->b_blocknr : 0; + if (cmp1 != cmp2 && cmp2 != 0) + numblks++; + } + ext4_ext_drop_refs(path); + ext4_ext_drop_refs(path2); + kfree(path); + kfree(path2); + } + +out: + inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9); + ext4_mark_inode_dirty(NULL, inode); + return 0; +} + +int ext4_ext_clear_bb(struct inode *inode) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ex; + ext4_lblk_t cur = 0, end; + int j, ret = 0; + struct ext4_map_blocks map; + + /* Determin the size of the file first */ + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, + EXT4_EX_NOCACHE); + if (IS_ERR(path)) + return PTR_ERR(path); + ex = path[path->p_depth].p_ext; + if (!ex) { + ext4_ext_drop_refs(path); + kfree(path); + return 0; + } + end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); + ext4_ext_drop_refs(path); + kfree(path); + + cur = 0; + while (cur < end) { + map.m_lblk = cur; + map.m_len = end - cur; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + break; + if (ret > 0) { + path = ext4_find_extent(inode, map.m_lblk, NULL, 0); + if (!IS_ERR_OR_NULL(path)) { + for (j = 0; j < path->p_depth; j++) { + + ext4_mb_mark_bb(inode->i_sb, + path[j].p_block, 1, 0); + } + ext4_ext_drop_refs(path); + kfree(path); + } + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + } + cur = cur + map.m_len; + } + + return 0; +} diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index e75171535375..0a729027322d 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -311,6 +311,9 @@ void ext4_es_find_extent_range(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es) { + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; + trace_ext4_es_find_extent_range_enter(inode, lblk); read_lock(&EXT4_I(inode)->i_es_lock); @@ -361,6 +364,9 @@ bool ext4_es_scan_range(struct inode *inode, { bool ret; + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return false; + read_lock(&EXT4_I(inode)->i_es_lock); ret = __es_scan_range(inode, matching_fn, lblk, end); read_unlock(&EXT4_I(inode)->i_es_lock); @@ -404,6 +410,9 @@ bool ext4_es_scan_clu(struct inode *inode, { bool ret; + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return false; + read_lock(&EXT4_I(inode)->i_es_lock); ret = __es_scan_clu(inode, matching_fn, lblk); read_unlock(&EXT4_I(inode)->i_es_lock); @@ -812,6 +821,9 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, int err = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return 0; + es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", lblk, len, pblk, status, inode->i_ino); @@ -873,6 +885,9 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status newes; ext4_lblk_t end = lblk + len - 1; + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; + newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, pblk, status); @@ -908,6 +923,9 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct rb_node *node; int found = 0; + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return 0; + trace_ext4_es_lookup_extent_enter(inode, lblk); es_debug("lookup extent in block %u\n", lblk); @@ -1419,6 +1437,9 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, int err = 0; int reserved = 0; + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return 0; + trace_ext4_es_remove_extent(inode, lblk, len); es_debug("remove [%u/%u) from extent status tree of inode %lu\n", lblk, len, inode->i_ino); @@ -1969,6 +1990,9 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, struct extent_status newes; int err = 0; + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return 0; + es_debug("add [%u/1) delayed to extent status tree of inode %lu\n", lblk, inode->i_ino); diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c new file mode 100644 index 000000000000..447c8d93f480 --- /dev/null +++ b/fs/ext4/fast_commit.c @@ -0,0 +1,2139 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * fs/ext4/fast_commit.c + * + * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> + * + * Ext4 fast commits routines. + */ +#include "ext4.h" +#include "ext4_jbd2.h" +#include "ext4_extents.h" +#include "mballoc.h" + +/* + * Ext4 Fast Commits + * ----------------- + * + * Ext4 fast commits implement fine grained journalling for Ext4. + * + * Fast commits are organized as a log of tag-length-value (TLV) structs. (See + * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by + * TLV during the recovery phase. For the scenarios for which we currently + * don't have replay code, fast commit falls back to full commits. + * Fast commits record delta in one of the following three categories. + * + * (A) Directory entry updates: + * + * - EXT4_FC_TAG_UNLINK - records directory entry unlink + * - EXT4_FC_TAG_LINK - records directory entry link + * - EXT4_FC_TAG_CREAT - records inode and directory entry creation + * + * (B) File specific data range updates: + * + * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode + * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode + * + * (C) Inode metadata (mtime / ctime etc): + * + * - EXT4_FC_TAG_INODE - record the inode that should be replayed + * during recovery. Note that iblocks field is + * not replayed and instead derived during + * replay. + * Commit Operation + * ---------------- + * With fast commits, we maintain all the directory entry operations in the + * order in which they are issued in an in-memory queue. This queue is flushed + * to disk during the commit operation. We also maintain a list of inodes + * that need to be committed during a fast commit in another in memory queue of + * inodes. During the commit operation, we commit in the following order: + * + * [1] Lock inodes for any further data updates by setting COMMITTING state + * [2] Submit data buffers of all the inodes + * [3] Wait for [2] to complete + * [4] Commit all the directory entry updates in the fast commit space + * [5] Commit all the changed inode structures + * [6] Write tail tag (this tag ensures the atomicity, please read the following + * section for more details). + * [7] Wait for [4], [5] and [6] to complete. + * + * All the inode updates must call ext4_fc_start_update() before starting an + * update. If such an ongoing update is present, fast commit waits for it to + * complete. The completion of such an update is marked by + * ext4_fc_stop_update(). + * + * Fast Commit Ineligibility + * ------------------------- + * Not all operations are supported by fast commits today (e.g extended + * attributes). Fast commit ineligiblity is marked by calling one of the + * two following functions: + * + * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall + * back to full commit. This is useful in case of transient errors. + * + * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all + * the fast commits happening between ext4_fc_start_ineligible() and + * ext4_fc_stop_ineligible() and one fast commit after the call to + * ext4_fc_stop_ineligible() to fall back to full commits. It is important to + * make one more fast commit to fall back to full commit after stop call so + * that it guaranteed that the fast commit ineligible operation contained + * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is + * followed by at least 1 full commit. + * + * Atomicity of commits + * -------------------- + * In order to gaurantee atomicity during the commit operation, fast commit + * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail + * tag contains CRC of the contents and TID of the transaction after which + * this fast commit should be applied. Recovery code replays fast commit + * logs only if there's at least 1 valid tail present. For every fast commit + * operation, there is 1 tail. This means, we may end up with multiple tails + * in the fast commit space. Here's an example: + * + * - Create a new file A and remove existing file B + * - fsync() + * - Append contents to file A + * - Truncate file A + * - fsync() + * + * The fast commit space at the end of above operations would look like this: + * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL] + * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->| + * + * Replay code should thus check for all the valid tails in the FC area. + * + * TODOs + * ----- + * 1) Make fast commit atomic updates more fine grained. Today, a fast commit + * eligible update must be protected within ext4_fc_start_update() and + * ext4_fc_stop_update(). These routines are called at much higher + * routines. This can be made more fine grained by combining with + * ext4_journal_start(). + * + * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() + * + * 3) Handle more ineligible cases. + */ + +#include <trace/events/ext4.h> +static struct kmem_cache *ext4_fc_dentry_cachep; + +static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate) +{ + BUFFER_TRACE(bh, ""); + if (uptodate) { + ext4_debug("%s: Block %lld up-to-date", + __func__, bh->b_blocknr); + set_buffer_uptodate(bh); + } else { + ext4_debug("%s: Block %lld not up-to-date", + __func__, bh->b_blocknr); + clear_buffer_uptodate(bh); + } + + unlock_buffer(bh); +} + +static inline void ext4_fc_reset_inode(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + ei->i_fc_lblk_start = 0; + ei->i_fc_lblk_len = 0; +} + +void ext4_fc_init_inode(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + ext4_fc_reset_inode(inode); + ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); + INIT_LIST_HEAD(&ei->i_fc_list); + init_waitqueue_head(&ei->i_fc_wait); + atomic_set(&ei->i_fc_updates, 0); + ei->i_fc_committed_subtid = 0; +} + +/* + * Inform Ext4's fast about start of an inode update + * + * This function is called by the high level call VFS callbacks before + * performing any inode update. This function blocks if there's an ongoing + * fast commit on the inode in question. + */ +void ext4_fc_start_update(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || + (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) + return; + +restart: + spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); + if (list_empty(&ei->i_fc_list)) + goto out; + + if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { + wait_queue_head_t *wq; +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_COMMITTING); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); + schedule(); + finish_wait(wq, &wait.wq_entry); + goto restart; + } +out: + atomic_inc(&ei->i_fc_updates); + spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); +} + +/* + * Stop inode update and wake up waiting fast commits if any. + */ +void ext4_fc_stop_update(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || + (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) + return; + + if (atomic_dec_and_test(&ei->i_fc_updates)) + wake_up_all(&ei->i_fc_wait); +} + +/* + * Remove inode from fast commit list. If the inode is being committed + * we wait until inode commit is done. + */ +void ext4_fc_del(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || + (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) + return; + +restart: + spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); + if (list_empty(&ei->i_fc_list)) { + spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); + return; + } + + if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { + wait_queue_head_t *wq; +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_COMMITTING); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); + schedule(); + finish_wait(wq, &wait.wq_entry); + goto restart; + } + if (!list_empty(&ei->i_fc_list)) + list_del_init(&ei->i_fc_list); + spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); +} + +/* + * Mark file system as fast commit ineligible. This means that next commit + * operation would result in a full jbd2 commit. + */ +void ext4_fc_mark_ineligible(struct super_block *sb, int reason) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || + (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) + return; + + sbi->s_mount_state |= EXT4_FC_INELIGIBLE; + WARN_ON(reason >= EXT4_FC_REASON_MAX); + sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; +} + +/* + * Start a fast commit ineligible update. Any commits that happen while + * such an operation is in progress fall back to full commits. + */ +void ext4_fc_start_ineligible(struct super_block *sb, int reason) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || + (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) + return; + + WARN_ON(reason >= EXT4_FC_REASON_MAX); + sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; + atomic_inc(&sbi->s_fc_ineligible_updates); +} + +/* + * Stop a fast commit ineligible update. We set EXT4_FC_INELIGIBLE flag here + * to ensure that after stopping the ineligible update, at least one full + * commit takes place. + */ +void ext4_fc_stop_ineligible(struct super_block *sb) +{ + if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || + (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) + return; + + EXT4_SB(sb)->s_mount_state |= EXT4_FC_INELIGIBLE; + atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates); +} + +static inline int ext4_fc_is_ineligible(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_mount_state & EXT4_FC_INELIGIBLE) || + atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates); +} + +/* + * Generic fast commit tracking function. If this is the first time this we are + * called after a full commit, we initialize fast commit fields and then call + * __fc_track_fn() with update = 0. If we have already been called after a full + * commit, we pass update = 1. Based on that, the track function can determine + * if it needs to track a field for the first time or if it needs to just + * update the previously tracked value. + * + * If enqueue is set, this function enqueues the inode in fast commit list. + */ +static int ext4_fc_track_template( + struct inode *inode, int (*__fc_track_fn)(struct inode *, void *, bool), + void *args, int enqueue) +{ + tid_t running_txn_tid; + bool update = false; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int ret; + + if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || + (sbi->s_mount_state & EXT4_FC_REPLAY)) + return -EOPNOTSUPP; + + if (ext4_fc_is_ineligible(inode->i_sb)) + return -EINVAL; + + running_txn_tid = sbi->s_journal ? + sbi->s_journal->j_commit_sequence + 1 : 0; + + mutex_lock(&ei->i_fc_lock); + if (running_txn_tid == ei->i_sync_tid) { + update = true; + } else { + ext4_fc_reset_inode(inode); + ei->i_sync_tid = running_txn_tid; + } + ret = __fc_track_fn(inode, args, update); + mutex_unlock(&ei->i_fc_lock); + + if (!enqueue) + return ret; + + spin_lock(&sbi->s_fc_lock); + if (list_empty(&EXT4_I(inode)->i_fc_list)) + list_add_tail(&EXT4_I(inode)->i_fc_list, + (sbi->s_mount_state & EXT4_FC_COMMITTING) ? + &sbi->s_fc_q[FC_Q_STAGING] : + &sbi->s_fc_q[FC_Q_MAIN]); + spin_unlock(&sbi->s_fc_lock); + + return ret; +} + +struct __track_dentry_update_args { + struct dentry *dentry; + int op; +}; + +/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */ +static int __track_dentry_update(struct inode *inode, void *arg, bool update) +{ + struct ext4_fc_dentry_update *node; + struct ext4_inode_info *ei = EXT4_I(inode); + struct __track_dentry_update_args *dentry_update = + (struct __track_dentry_update_args *)arg; + struct dentry *dentry = dentry_update->dentry; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + mutex_unlock(&ei->i_fc_lock); + node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); + if (!node) { + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MEM); + mutex_lock(&ei->i_fc_lock); + return -ENOMEM; + } + + node->fcd_op = dentry_update->op; + node->fcd_parent = dentry->d_parent->d_inode->i_ino; + node->fcd_ino = inode->i_ino; + if (dentry->d_name.len > DNAME_INLINE_LEN) { + node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); + if (!node->fcd_name.name) { + kmem_cache_free(ext4_fc_dentry_cachep, node); + ext4_fc_mark_ineligible(inode->i_sb, + EXT4_FC_REASON_MEM); + mutex_lock(&ei->i_fc_lock); + return -ENOMEM; + } + memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, + dentry->d_name.len); + } else { + memcpy(node->fcd_iname, dentry->d_name.name, + dentry->d_name.len); + node->fcd_name.name = node->fcd_iname; + } + node->fcd_name.len = dentry->d_name.len; + + spin_lock(&sbi->s_fc_lock); + if (sbi->s_mount_state & EXT4_FC_COMMITTING) + list_add_tail(&node->fcd_list, + &sbi->s_fc_dentry_q[FC_Q_STAGING]); + else + list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); + spin_unlock(&sbi->s_fc_lock); + mutex_lock(&ei->i_fc_lock); + + return 0; +} + +void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry) +{ + struct __track_dentry_update_args args; + int ret; + + args.dentry = dentry; + args.op = EXT4_FC_TAG_UNLINK; + + ret = ext4_fc_track_template(inode, __track_dentry_update, + (void *)&args, 0); + trace_ext4_fc_track_unlink(inode, dentry, ret); +} + +void ext4_fc_track_link(struct inode *inode, struct dentry *dentry) +{ + struct __track_dentry_update_args args; + int ret; + + args.dentry = dentry; + args.op = EXT4_FC_TAG_LINK; + + ret = ext4_fc_track_template(inode, __track_dentry_update, + (void *)&args, 0); + trace_ext4_fc_track_link(inode, dentry, ret); +} + +void ext4_fc_track_create(struct inode *inode, struct dentry *dentry) +{ + struct __track_dentry_update_args args; + int ret; + + args.dentry = dentry; + args.op = EXT4_FC_TAG_CREAT; + + ret = ext4_fc_track_template(inode, __track_dentry_update, + (void *)&args, 0); + trace_ext4_fc_track_create(inode, dentry, ret); +} + +/* __track_fn for inode tracking */ +static int __track_inode(struct inode *inode, void *arg, bool update) +{ + if (update) + return -EEXIST; + + EXT4_I(inode)->i_fc_lblk_len = 0; + + return 0; +} + +void ext4_fc_track_inode(struct inode *inode) +{ + int ret; + + if (S_ISDIR(inode->i_mode)) + return; + + ret = ext4_fc_track_template(inode, __track_inode, NULL, 1); + trace_ext4_fc_track_inode(inode, ret); +} + +struct __track_range_args { + ext4_lblk_t start, end; +}; + +/* __track_fn for tracking data updates */ +static int __track_range(struct inode *inode, void *arg, bool update) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_lblk_t oldstart; + struct __track_range_args *__arg = + (struct __track_range_args *)arg; + + if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { + ext4_debug("Special inode %ld being modified\n", inode->i_ino); + return -ECANCELED; + } + + oldstart = ei->i_fc_lblk_start; + + if (update && ei->i_fc_lblk_len > 0) { + ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); + ei->i_fc_lblk_len = + max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - + ei->i_fc_lblk_start + 1; + } else { + ei->i_fc_lblk_start = __arg->start; + ei->i_fc_lblk_len = __arg->end - __arg->start + 1; + } + + return 0; +} + +void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) +{ + struct __track_range_args args; + int ret; + + if (S_ISDIR(inode->i_mode)) + return; + + args.start = start; + args.end = end; + + ret = ext4_fc_track_template(inode, __track_range, &args, 1); + + trace_ext4_fc_track_range(inode, start, end, ret); +} + +static void ext4_fc_submit_bh(struct super_block *sb) +{ + int write_flags = REQ_SYNC; + struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; + + if (test_opt(sb, BARRIER)) + write_flags |= REQ_FUA | REQ_PREFLUSH; + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = ext4_end_buffer_io_sync; + submit_bh(REQ_OP_WRITE, write_flags, bh); + EXT4_SB(sb)->s_fc_bh = NULL; +} + +/* Ext4 commit path routines */ + +/* memzero and update CRC */ +static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, + u32 *crc) +{ + void *ret; + + ret = memset(dst, 0, len); + if (crc) + *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len); + return ret; +} + +/* + * Allocate len bytes on a fast commit buffer. + * + * During the commit time this function is used to manage fast commit + * block space. We don't split a fast commit log onto different + * blocks. So this function makes sure that if there's not enough space + * on the current block, the remaining space in the current block is + * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, + * new block is from jbd2 and CRC is updated to reflect the padding + * we added. + */ +static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) +{ + struct ext4_fc_tl *tl; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *bh; + int bsize = sbi->s_journal->j_blocksize; + int ret, off = sbi->s_fc_bytes % bsize; + int pad_len; + + /* + * After allocating len, we should have space at least for a 0 byte + * padding. + */ + if (len + sizeof(struct ext4_fc_tl) > bsize) + return NULL; + + if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) { + /* + * Only allocate from current buffer if we have enough space for + * this request AND we have space to add a zero byte padding. + */ + if (!sbi->s_fc_bh) { + ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); + if (ret) + return NULL; + sbi->s_fc_bh = bh; + } + sbi->s_fc_bytes += len; + return sbi->s_fc_bh->b_data + off; + } + /* Need to add PAD tag */ + tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off); + tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); + pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl); + tl->fc_len = cpu_to_le16(pad_len); + if (crc) + *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl)); + if (pad_len > 0) + ext4_fc_memzero(sb, tl + 1, pad_len, crc); + ext4_fc_submit_bh(sb); + + ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); + if (ret) + return NULL; + sbi->s_fc_bh = bh; + sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len; + return sbi->s_fc_bh->b_data; +} + +/* memcpy to fc reserved space and update CRC */ +static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, + int len, u32 *crc) +{ + if (crc) + *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); + return memcpy(dst, src, len); +} + +/* + * Complete a fast commit by writing tail tag. + * + * Writing tail tag marks the end of a fast commit. In order to guarantee + * atomicity, after writing tail tag, even if there's space remaining + * in the block, next commit shouldn't use it. That's why tail tag + * has the length as that of the remaining space on the block. + */ +static int ext4_fc_write_tail(struct super_block *sb, u32 crc) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_fc_tl tl; + struct ext4_fc_tail tail; + int off, bsize = sbi->s_journal->j_blocksize; + u8 *dst; + + /* + * ext4_fc_reserve_space takes care of allocating an extra block if + * there's no enough space on this block for accommodating this tail. + */ + dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc); + if (!dst) + return -ENOSPC; + + off = sbi->s_fc_bytes % bsize; + + tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); + tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail)); + sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); + + ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc); + dst += sizeof(tl); + tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); + ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc); + dst += sizeof(tail.fc_tid); + tail.fc_crc = cpu_to_le32(crc); + ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); + + ext4_fc_submit_bh(sb); + + return 0; +} + +/* + * Adds tag, length, value and updates CRC. Returns true if tlv was added. + * Returns false if there's not enough space. + */ +static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, + u32 *crc) +{ + struct ext4_fc_tl tl; + u8 *dst; + + dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc); + if (!dst) + return false; + + tl.fc_tag = cpu_to_le16(tag); + tl.fc_len = cpu_to_le16(len); + + ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); + ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc); + + return true; +} + +/* Same as above, but adds dentry tlv. */ +static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag, + int parent_ino, int ino, int dlen, + const unsigned char *dname, + u32 *crc) +{ + struct ext4_fc_dentry_info fcd; + struct ext4_fc_tl tl; + u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen, + crc); + + if (!dst) + return false; + + fcd.fc_parent_ino = cpu_to_le32(parent_ino); + fcd.fc_ino = cpu_to_le32(ino); + tl.fc_tag = cpu_to_le16(tag); + tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); + ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); + dst += sizeof(tl); + ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); + dst += sizeof(fcd); + ext4_fc_memcpy(sb, dst, dname, dlen, crc); + dst += dlen; + + return true; +} + +/* + * Writes inode in the fast commit space under TLV with tag @tag. + * Returns 0 on success, error on failure. + */ +static int ext4_fc_write_inode(struct inode *inode, u32 *crc) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + int inode_len = EXT4_GOOD_OLD_INODE_SIZE; + int ret; + struct ext4_iloc iloc; + struct ext4_fc_inode fc_inode; + struct ext4_fc_tl tl; + u8 *dst; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) + inode_len += ei->i_extra_isize; + + fc_inode.fc_ino = cpu_to_le32(inode->i_ino); + tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); + tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); + + dst = ext4_fc_reserve_space(inode->i_sb, + sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc); + if (!dst) + return -ECANCELED; + + if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc)) + return -ECANCELED; + dst += sizeof(tl); + if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc)) + return -ECANCELED; + dst += sizeof(fc_inode); + if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc), + inode_len, crc)) + return -ECANCELED; + + return 0; +} + +/* + * Writes updated data ranges for the inode in question. Updates CRC. + * Returns 0 on success, error otherwise. + */ +static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) +{ + ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_map_blocks map; + struct ext4_fc_add_range fc_ext; + struct ext4_fc_del_range lrange; + struct ext4_extent *ex; + int ret; + + mutex_lock(&ei->i_fc_lock); + if (ei->i_fc_lblk_len == 0) { + mutex_unlock(&ei->i_fc_lock); + return 0; + } + old_blk_size = ei->i_fc_lblk_start; + new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; + ei->i_fc_lblk_len = 0; + mutex_unlock(&ei->i_fc_lock); + + cur_lblk_off = old_blk_size; + jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n", + __func__, cur_lblk_off, new_blk_size, inode->i_ino); + + while (cur_lblk_off <= new_blk_size) { + map.m_lblk = cur_lblk_off; + map.m_len = new_blk_size - cur_lblk_off + 1; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return -ECANCELED; + + if (map.m_len == 0) { + cur_lblk_off++; + continue; + } + + if (ret == 0) { + lrange.fc_ino = cpu_to_le32(inode->i_ino); + lrange.fc_lblk = cpu_to_le32(map.m_lblk); + lrange.fc_len = cpu_to_le32(map.m_len); + if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, + sizeof(lrange), (u8 *)&lrange, crc)) + return -ENOSPC; + } else { + fc_ext.fc_ino = cpu_to_le32(inode->i_ino); + ex = (struct ext4_extent *)&fc_ext.fc_ex; + ex->ee_block = cpu_to_le32(map.m_lblk); + ex->ee_len = cpu_to_le16(map.m_len); + ext4_ext_store_pblock(ex, map.m_pblk); + if (map.m_flags & EXT4_MAP_UNWRITTEN) + ext4_ext_mark_unwritten(ex); + else + ext4_ext_mark_initialized(ex); + if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, + sizeof(fc_ext), (u8 *)&fc_ext, crc)) + return -ENOSPC; + } + + cur_lblk_off += map.m_len; + } + + return 0; +} + + +/* Submit data for all the fast commit inodes */ +static int ext4_fc_submit_inode_data_all(journal_t *journal) +{ + struct super_block *sb = (struct super_block *)(journal->j_private); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_inode_info *ei; + struct list_head *pos; + int ret = 0; + + spin_lock(&sbi->s_fc_lock); + sbi->s_mount_state |= EXT4_FC_COMMITTING; + list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { + ei = list_entry(pos, struct ext4_inode_info, i_fc_list); + ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); + while (atomic_read(&ei->i_fc_updates)) { + DEFINE_WAIT(wait); + + prepare_to_wait(&ei->i_fc_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (atomic_read(&ei->i_fc_updates)) { + spin_unlock(&sbi->s_fc_lock); + schedule(); + spin_lock(&sbi->s_fc_lock); + } + finish_wait(&ei->i_fc_wait, &wait); + } + spin_unlock(&sbi->s_fc_lock); + ret = jbd2_submit_inode_data(ei->jinode); + if (ret) + return ret; + spin_lock(&sbi->s_fc_lock); + } + spin_unlock(&sbi->s_fc_lock); + + return ret; +} + +/* Wait for completion of data for all the fast commit inodes */ +static int ext4_fc_wait_inode_data_all(journal_t *journal) +{ + struct super_block *sb = (struct super_block *)(journal->j_private); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_inode_info *pos, *n; + int ret = 0; + + spin_lock(&sbi->s_fc_lock); + list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + if (!ext4_test_inode_state(&pos->vfs_inode, + EXT4_STATE_FC_COMMITTING)) + continue; + spin_unlock(&sbi->s_fc_lock); + + ret = jbd2_wait_inode_data(journal, pos->jinode); + if (ret) + return ret; + spin_lock(&sbi->s_fc_lock); + } + spin_unlock(&sbi->s_fc_lock); + + return 0; +} + +/* Commit all the directory entry updates */ +static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) +{ + struct super_block *sb = (struct super_block *)(journal->j_private); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_fc_dentry_update *fc_dentry; + struct inode *inode; + struct list_head *pos, *n, *fcd_pos, *fcd_n; + struct ext4_inode_info *ei; + int ret; + + if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) + return 0; + list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) { + fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update, + fcd_list); + if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { + spin_unlock(&sbi->s_fc_lock); + if (!ext4_fc_add_dentry_tlv( + sb, fc_dentry->fcd_op, + fc_dentry->fcd_parent, fc_dentry->fcd_ino, + fc_dentry->fcd_name.len, + fc_dentry->fcd_name.name, crc)) { + ret = -ENOSPC; + goto lock_and_exit; + } + spin_lock(&sbi->s_fc_lock); + continue; + } + + inode = NULL; + list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) { + ei = list_entry(pos, struct ext4_inode_info, i_fc_list); + if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) { + inode = &ei->vfs_inode; + break; + } + } + /* + * If we don't find inode in our list, then it was deleted, + * in which case, we don't need to record it's create tag. + */ + if (!inode) + continue; + spin_unlock(&sbi->s_fc_lock); + + /* + * We first write the inode and then the create dirent. This + * allows the recovery code to create an unnamed inode first + * and then link it to a directory entry. This allows us + * to use namei.c routines almost as is and simplifies + * the recovery code. + */ + ret = ext4_fc_write_inode(inode, crc); + if (ret) + goto lock_and_exit; + + ret = ext4_fc_write_inode_data(inode, crc); + if (ret) + goto lock_and_exit; + + if (!ext4_fc_add_dentry_tlv( + sb, fc_dentry->fcd_op, + fc_dentry->fcd_parent, fc_dentry->fcd_ino, + fc_dentry->fcd_name.len, + fc_dentry->fcd_name.name, crc)) { + spin_lock(&sbi->s_fc_lock); + ret = -ENOSPC; + goto lock_and_exit; + } + + spin_lock(&sbi->s_fc_lock); + } + return 0; +lock_and_exit: + spin_lock(&sbi->s_fc_lock); + return ret; +} + +static int ext4_fc_perform_commit(journal_t *journal) +{ + struct super_block *sb = (struct super_block *)(journal->j_private); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_inode_info *iter; + struct ext4_fc_head head; + struct list_head *pos; + struct inode *inode; + struct blk_plug plug; + int ret = 0; + u32 crc = 0; + + ret = ext4_fc_submit_inode_data_all(journal); + if (ret) + return ret; + + ret = ext4_fc_wait_inode_data_all(journal); + if (ret) + return ret; + + blk_start_plug(&plug); + if (sbi->s_fc_bytes == 0) { + /* + * Add a head tag only if this is the first fast commit + * in this TID. + */ + head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); + head.fc_tid = cpu_to_le32( + sbi->s_journal->j_running_transaction->t_tid); + if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), + (u8 *)&head, &crc)) + goto out; + } + + spin_lock(&sbi->s_fc_lock); + ret = ext4_fc_commit_dentry_updates(journal, &crc); + if (ret) { + spin_unlock(&sbi->s_fc_lock); + goto out; + } + + list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { + iter = list_entry(pos, struct ext4_inode_info, i_fc_list); + inode = &iter->vfs_inode; + if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) + continue; + + spin_unlock(&sbi->s_fc_lock); + ret = ext4_fc_write_inode_data(inode, &crc); + if (ret) + goto out; + ret = ext4_fc_write_inode(inode, &crc); + if (ret) + goto out; + spin_lock(&sbi->s_fc_lock); + EXT4_I(inode)->i_fc_committed_subtid = + atomic_read(&sbi->s_fc_subtid); + } + spin_unlock(&sbi->s_fc_lock); + + ret = ext4_fc_write_tail(sb, crc); + +out: + blk_finish_plug(&plug); + return ret; +} + +/* + * The main commit entry point. Performs a fast commit for transaction + * commit_tid if needed. If it's not possible to perform a fast commit + * due to various reasons, we fall back to full commit. Returns 0 + * on success, error otherwise. + */ +int ext4_fc_commit(journal_t *journal, tid_t commit_tid) +{ + struct super_block *sb = (struct super_block *)(journal->j_private); + struct ext4_sb_info *sbi = EXT4_SB(sb); + int nblks = 0, ret, bsize = journal->j_blocksize; + int subtid = atomic_read(&sbi->s_fc_subtid); + int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0; + ktime_t start_time, commit_time; + + trace_ext4_fc_commit_start(sb); + + start_time = ktime_get(); + + if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || + (ext4_fc_is_ineligible(sb))) { + reason = EXT4_FC_REASON_INELIGIBLE; + goto out; + } + +restart_fc: + ret = jbd2_fc_begin_commit(journal, commit_tid); + if (ret == -EALREADY) { + /* There was an ongoing commit, check if we need to restart */ + if (atomic_read(&sbi->s_fc_subtid) <= subtid && + commit_tid > journal->j_commit_sequence) + goto restart_fc; + reason = EXT4_FC_REASON_ALREADY_COMMITTED; + goto out; + } else if (ret) { + sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; + reason = EXT4_FC_REASON_FC_START_FAILED; + goto out; + } + + fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; + ret = ext4_fc_perform_commit(journal); + if (ret < 0) { + sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; + reason = EXT4_FC_REASON_FC_FAILED; + goto out; + } + nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; + ret = jbd2_fc_wait_bufs(journal, nblks); + if (ret < 0) { + sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; + reason = EXT4_FC_REASON_FC_FAILED; + goto out; + } + atomic_inc(&sbi->s_fc_subtid); + jbd2_fc_end_commit(journal); +out: + /* Has any ineligible update happened since we started? */ + if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) { + sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; + reason = EXT4_FC_REASON_INELIGIBLE; + } + + spin_lock(&sbi->s_fc_lock); + if (reason != EXT4_FC_REASON_OK && + reason != EXT4_FC_REASON_ALREADY_COMMITTED) { + sbi->s_fc_stats.fc_ineligible_commits++; + } else { + sbi->s_fc_stats.fc_num_commits++; + sbi->s_fc_stats.fc_numblks += nblks; + } + spin_unlock(&sbi->s_fc_lock); + nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0; + trace_ext4_fc_commit_stop(sb, nblks, reason); + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + /* + * weight the commit time higher than the average time so we don't + * react too strongly to vast changes in the commit time + */ + if (likely(sbi->s_fc_avg_commit_time)) + sbi->s_fc_avg_commit_time = (commit_time + + sbi->s_fc_avg_commit_time * 3) / 4; + else + sbi->s_fc_avg_commit_time = commit_time; + jbd_debug(1, + "Fast commit ended with blks = %d, reason = %d, subtid - %d", + nblks, reason, subtid); + if (reason == EXT4_FC_REASON_FC_FAILED) + return jbd2_fc_end_commit_fallback(journal, commit_tid); + if (reason == EXT4_FC_REASON_FC_START_FAILED || + reason == EXT4_FC_REASON_INELIGIBLE) + return jbd2_complete_transaction(journal, commit_tid); + return 0; +} + +/* + * Fast commit cleanup routine. This is called after every fast commit and + * full commit. full is true if we are called after a full commit. + */ +static void ext4_fc_cleanup(journal_t *journal, int full) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_inode_info *iter; + struct ext4_fc_dentry_update *fc_dentry; + struct list_head *pos, *n; + + if (full && sbi->s_fc_bh) + sbi->s_fc_bh = NULL; + + jbd2_fc_release_bufs(journal); + + spin_lock(&sbi->s_fc_lock); + list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) { + iter = list_entry(pos, struct ext4_inode_info, i_fc_list); + list_del_init(&iter->i_fc_list); + ext4_clear_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_COMMITTING); + ext4_fc_reset_inode(&iter->vfs_inode); + /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ + smp_mb(); +#if (BITS_PER_LONG < 64) + wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); +#else + wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); +#endif + } + + while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { + fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], + struct ext4_fc_dentry_update, + fcd_list); + list_del_init(&fc_dentry->fcd_list); + spin_unlock(&sbi->s_fc_lock); + + if (fc_dentry->fcd_name.name && + fc_dentry->fcd_name.len > DNAME_INLINE_LEN) + kfree(fc_dentry->fcd_name.name); + kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); + spin_lock(&sbi->s_fc_lock); + } + + list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], + &sbi->s_fc_dentry_q[FC_Q_MAIN]); + list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], + &sbi->s_fc_q[FC_Q_STAGING]); + + sbi->s_mount_state &= ~EXT4_FC_COMMITTING; + sbi->s_mount_state &= ~EXT4_FC_INELIGIBLE; + + if (full) + sbi->s_fc_bytes = 0; + spin_unlock(&sbi->s_fc_lock); + trace_ext4_fc_stats(sb); +} + +/* Ext4 Replay Path Routines */ + +/* Get length of a particular tlv */ +static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl) +{ + return le16_to_cpu(tl->fc_len); +} + +/* Get a pointer to "value" of a tlv */ +static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl) +{ + return (u8 *)tl + sizeof(*tl); +} + +/* Helper struct for dentry replay routines */ +struct dentry_info_args { + int parent_ino, dname_len, ino, inode_len; + char *dname; +}; + +static inline void tl_to_darg(struct dentry_info_args *darg, + struct ext4_fc_tl *tl) +{ + struct ext4_fc_dentry_info *fcd; + + fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl); + + darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino); + darg->ino = le32_to_cpu(fcd->fc_ino); + darg->dname = fcd->fc_dname; + darg->dname_len = ext4_fc_tag_len(tl) - + sizeof(struct ext4_fc_dentry_info); +} + +/* Unlink replay function */ +static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl) +{ + struct inode *inode, *old_parent; + struct qstr entry; + struct dentry_info_args darg; + int ret = 0; + + tl_to_darg(&darg, tl); + + trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, + darg.parent_ino, darg.dname_len); + + entry.name = darg.dname; + entry.len = darg.dname_len; + inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); + + if (IS_ERR_OR_NULL(inode)) { + jbd_debug(1, "Inode %d not found", darg.ino); + return 0; + } + + old_parent = ext4_iget(sb, darg.parent_ino, + EXT4_IGET_NORMAL); + if (IS_ERR_OR_NULL(old_parent)) { + jbd_debug(1, "Dir with inode %d not found", darg.parent_ino); + iput(inode); + return 0; + } + + ret = __ext4_unlink(old_parent, &entry, inode); + /* -ENOENT ok coz it might not exist anymore. */ + if (ret == -ENOENT) + ret = 0; + iput(old_parent); + iput(inode); + return ret; +} + +static int ext4_fc_replay_link_internal(struct super_block *sb, + struct dentry_info_args *darg, + struct inode *inode) +{ + struct inode *dir = NULL; + struct dentry *dentry_dir = NULL, *dentry_inode = NULL; + struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); + int ret = 0; + + dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); + if (IS_ERR(dir)) { + jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino); + dir = NULL; + goto out; + } + + dentry_dir = d_obtain_alias(dir); + if (IS_ERR(dentry_dir)) { + jbd_debug(1, "Failed to obtain dentry"); + dentry_dir = NULL; + goto out; + } + + dentry_inode = d_alloc(dentry_dir, &qstr_dname); + if (!dentry_inode) { + jbd_debug(1, "Inode dentry not created."); + ret = -ENOMEM; + goto out; + } + + ret = __ext4_link(dir, inode, dentry_inode); + /* + * It's possible that link already existed since data blocks + * for the dir in question got persisted before we crashed OR + * we replayed this tag and crashed before the entire replay + * could complete. + */ + if (ret && ret != -EEXIST) { + jbd_debug(1, "Failed to link\n"); + goto out; + } + + ret = 0; +out: + if (dentry_dir) { + d_drop(dentry_dir); + dput(dentry_dir); + } else if (dir) { + iput(dir); + } + if (dentry_inode) { + d_drop(dentry_inode); + dput(dentry_inode); + } + + return ret; +} + +/* Link replay function */ +static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl) +{ + struct inode *inode; + struct dentry_info_args darg; + int ret = 0; + + tl_to_darg(&darg, tl); + trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, + darg.parent_ino, darg.dname_len); + + inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); + if (IS_ERR_OR_NULL(inode)) { + jbd_debug(1, "Inode not found."); + return 0; + } + + ret = ext4_fc_replay_link_internal(sb, &darg, inode); + iput(inode); + return ret; +} + +/* + * Record all the modified inodes during replay. We use this later to setup + * block bitmaps correctly. + */ +static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) +{ + struct ext4_fc_replay_state *state; + int i; + + state = &EXT4_SB(sb)->s_fc_replay_state; + for (i = 0; i < state->fc_modified_inodes_used; i++) + if (state->fc_modified_inodes[i] == ino) + return 0; + if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { + state->fc_modified_inodes_size += + EXT4_FC_REPLAY_REALLOC_INCREMENT; + state->fc_modified_inodes = krealloc( + state->fc_modified_inodes, sizeof(int) * + state->fc_modified_inodes_size, + GFP_KERNEL); + if (!state->fc_modified_inodes) + return -ENOMEM; + } + state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; + return 0; +} + +/* + * Inode replay function + */ +static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl) +{ + struct ext4_fc_inode *fc_inode; + struct ext4_inode *raw_inode; + struct ext4_inode *raw_fc_inode; + struct inode *inode = NULL; + struct ext4_iloc iloc; + int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag); + struct ext4_extent_header *eh; + + fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl); + + ino = le32_to_cpu(fc_inode->fc_ino); + trace_ext4_fc_replay(sb, tag, ino, 0, 0); + + inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); + if (!IS_ERR_OR_NULL(inode)) { + ext4_ext_clear_bb(inode); + iput(inode); + } + + ext4_fc_record_modified_inode(sb, ino); + + raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode; + ret = ext4_get_fc_inode_loc(sb, ino, &iloc); + if (ret) + goto out; + + inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode); + raw_inode = ext4_raw_inode(&iloc); + + memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); + memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation, + inode_len - offsetof(struct ext4_inode, i_generation)); + if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { + eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); + if (eh->eh_magic != EXT4_EXT_MAGIC) { + memset(eh, 0, sizeof(*eh)); + eh->eh_magic = EXT4_EXT_MAGIC; + eh->eh_max = cpu_to_le16( + (sizeof(raw_inode->i_block) - + sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent)); + } + } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { + memcpy(raw_inode->i_block, raw_fc_inode->i_block, + sizeof(raw_inode->i_block)); + } + + /* Immediately update the inode on disk. */ + ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); + if (ret) + goto out; + ret = sync_dirty_buffer(iloc.bh); + if (ret) + goto out; + ret = ext4_mark_inode_used(sb, ino); + if (ret) + goto out; + + /* Given that we just wrote the inode on disk, this SHOULD succeed. */ + inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); + if (IS_ERR_OR_NULL(inode)) { + jbd_debug(1, "Inode not found."); + return -EFSCORRUPTED; + } + + /* + * Our allocator could have made different decisions than before + * crashing. This should be fixed but until then, we calculate + * the number of blocks the inode. + */ + ext4_ext_replay_set_iblocks(inode); + + inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); + ext4_reset_inode_seed(inode); + + ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); + ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); + sync_dirty_buffer(iloc.bh); + brelse(iloc.bh); +out: + iput(inode); + if (!ret) + blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); + + return 0; +} + +/* + * Dentry create replay function. + * + * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the + * inode for which we are trying to create a dentry here, should already have + * been replayed before we start here. + */ +static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl) +{ + int ret = 0; + struct inode *inode = NULL; + struct inode *dir = NULL; + struct dentry_info_args darg; + + tl_to_darg(&darg, tl); + + trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, + darg.parent_ino, darg.dname_len); + + /* This takes care of update group descriptor and other metadata */ + ret = ext4_mark_inode_used(sb, darg.ino); + if (ret) + goto out; + + inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); + if (IS_ERR_OR_NULL(inode)) { + jbd_debug(1, "inode %d not found.", darg.ino); + inode = NULL; + ret = -EINVAL; + goto out; + } + + if (S_ISDIR(inode->i_mode)) { + /* + * If we are creating a directory, we need to make sure that the + * dot and dot dot dirents are setup properly. + */ + dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); + if (IS_ERR_OR_NULL(dir)) { + jbd_debug(1, "Dir %d not found.", darg.ino); + goto out; + } + ret = ext4_init_new_dir(NULL, dir, inode); + iput(dir); + if (ret) { + ret = 0; + goto out; + } + } + ret = ext4_fc_replay_link_internal(sb, &darg, inode); + if (ret) + goto out; + set_nlink(inode, 1); + ext4_mark_inode_dirty(NULL, inode); +out: + if (inode) + iput(inode); + return ret; +} + +/* + * Record physical disk regions which are in use as per fast commit area. Our + * simple replay phase allocator excludes these regions from allocation. + */ +static int ext4_fc_record_regions(struct super_block *sb, int ino, + ext4_lblk_t lblk, ext4_fsblk_t pblk, int len) +{ + struct ext4_fc_replay_state *state; + struct ext4_fc_alloc_region *region; + + state = &EXT4_SB(sb)->s_fc_replay_state; + if (state->fc_regions_used == state->fc_regions_size) { + state->fc_regions_size += + EXT4_FC_REPLAY_REALLOC_INCREMENT; + state->fc_regions = krealloc( + state->fc_regions, + state->fc_regions_size * + sizeof(struct ext4_fc_alloc_region), + GFP_KERNEL); + if (!state->fc_regions) + return -ENOMEM; + } + region = &state->fc_regions[state->fc_regions_used++]; + region->ino = ino; + region->lblk = lblk; + region->pblk = pblk; + region->len = len; + + return 0; +} + +/* Replay add range tag */ +static int ext4_fc_replay_add_range(struct super_block *sb, + struct ext4_fc_tl *tl) +{ + struct ext4_fc_add_range *fc_add_ex; + struct ext4_extent newex, *ex; + struct inode *inode; + ext4_lblk_t start, cur; + int remaining, len; + ext4_fsblk_t start_pblk; + struct ext4_map_blocks map; + struct ext4_ext_path *path = NULL; + int ret; + + fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl); + ex = (struct ext4_extent *)&fc_add_ex->fc_ex; + + trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, + le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex)); + + inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino), + EXT4_IGET_NORMAL); + if (IS_ERR_OR_NULL(inode)) { + jbd_debug(1, "Inode not found."); + return 0; + } + + ret = ext4_fc_record_modified_inode(sb, inode->i_ino); + + start = le32_to_cpu(ex->ee_block); + start_pblk = ext4_ext_pblock(ex); + len = ext4_ext_get_actual_len(ex); + + cur = start; + remaining = len; + jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", + start, start_pblk, len, ext4_ext_is_unwritten(ex), + inode->i_ino); + + while (remaining > 0) { + map.m_lblk = cur; + map.m_len = remaining; + map.m_pblk = 0; + ret = ext4_map_blocks(NULL, inode, &map, 0); + + if (ret < 0) { + iput(inode); + return 0; + } + + if (ret == 0) { + /* Range is not mapped */ + path = ext4_find_extent(inode, cur, NULL, 0); + if (!path) + continue; + memset(&newex, 0, sizeof(newex)); + newex.ee_block = cpu_to_le32(cur); + ext4_ext_store_pblock( + &newex, start_pblk + cur - start); + newex.ee_len = cpu_to_le16(map.m_len); + if (ext4_ext_is_unwritten(ex)) + ext4_ext_mark_unwritten(&newex); + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_ext_insert_extent( + NULL, inode, &path, &newex, 0); + up_write((&EXT4_I(inode)->i_data_sem)); + ext4_ext_drop_refs(path); + kfree(path); + if (ret) { + iput(inode); + return 0; + } + goto next; + } + + if (start_pblk + cur - start != map.m_pblk) { + /* + * Logical to physical mapping changed. This can happen + * if this range was removed and then reallocated to + * map to new physical blocks during a fast commit. + */ + ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, + ext4_ext_is_unwritten(ex), + start_pblk + cur - start); + if (ret) { + iput(inode); + return 0; + } + /* + * Mark the old blocks as free since they aren't used + * anymore. We maintain an array of all the modified + * inodes. In case these blocks are still used at either + * a different logical range in the same inode or in + * some different inode, we will mark them as allocated + * at the end of the FC replay using our array of + * modified inodes. + */ + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + goto next; + } + + /* Range is mapped and needs a state change */ + jbd_debug(1, "Converting from %d to %d %lld", + map.m_flags & EXT4_MAP_UNWRITTEN, + ext4_ext_is_unwritten(ex), map.m_pblk); + ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, + ext4_ext_is_unwritten(ex), map.m_pblk); + if (ret) { + iput(inode); + return 0; + } + /* + * We may have split the extent tree while toggling the state. + * Try to shrink the extent tree now. + */ + ext4_ext_replay_shrink_inode(inode, start + len); +next: + cur += map.m_len; + remaining -= map.m_len; + } + ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> + sb->s_blocksize_bits); + iput(inode); + return 0; +} + +/* Replay DEL_RANGE tag */ +static int +ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl) +{ + struct inode *inode; + struct ext4_fc_del_range *lrange; + struct ext4_map_blocks map; + ext4_lblk_t cur, remaining; + int ret; + + lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl); + cur = le32_to_cpu(lrange->fc_lblk); + remaining = le32_to_cpu(lrange->fc_len); + + trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, + le32_to_cpu(lrange->fc_ino), cur, remaining); + + inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL); + if (IS_ERR_OR_NULL(inode)) { + jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino)); + return 0; + } + + ret = ext4_fc_record_modified_inode(sb, inode->i_ino); + + jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n", + inode->i_ino, le32_to_cpu(lrange->fc_lblk), + le32_to_cpu(lrange->fc_len)); + while (remaining > 0) { + map.m_lblk = cur; + map.m_len = remaining; + + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) { + iput(inode); + return 0; + } + if (ret > 0) { + remaining -= ret; + cur += ret; + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + } else { + remaining -= map.m_len; + cur += map.m_len; + } + } + + ret = ext4_punch_hole(inode, + le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits, + le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits); + if (ret) + jbd_debug(1, "ext4_punch_hole returned %d", ret); + ext4_ext_replay_shrink_inode(inode, + i_size_read(inode) >> sb->s_blocksize_bits); + ext4_mark_inode_dirty(NULL, inode); + iput(inode); + + return 0; +} + +static inline const char *tag2str(u16 tag) +{ + switch (tag) { + case EXT4_FC_TAG_LINK: + return "TAG_ADD_ENTRY"; + case EXT4_FC_TAG_UNLINK: + return "TAG_DEL_ENTRY"; + case EXT4_FC_TAG_ADD_RANGE: + return "TAG_ADD_RANGE"; + case EXT4_FC_TAG_CREAT: + return "TAG_CREAT_DENTRY"; + case EXT4_FC_TAG_DEL_RANGE: + return "TAG_DEL_RANGE"; + case EXT4_FC_TAG_INODE: + return "TAG_INODE"; + case EXT4_FC_TAG_PAD: + return "TAG_PAD"; + case EXT4_FC_TAG_TAIL: + return "TAG_TAIL"; + case EXT4_FC_TAG_HEAD: + return "TAG_HEAD"; + default: + return "TAG_ERROR"; + } +} + +static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) +{ + struct ext4_fc_replay_state *state; + struct inode *inode; + struct ext4_ext_path *path = NULL; + struct ext4_map_blocks map; + int i, ret, j; + ext4_lblk_t cur, end; + + state = &EXT4_SB(sb)->s_fc_replay_state; + for (i = 0; i < state->fc_modified_inodes_used; i++) { + inode = ext4_iget(sb, state->fc_modified_inodes[i], + EXT4_IGET_NORMAL); + if (IS_ERR_OR_NULL(inode)) { + jbd_debug(1, "Inode %d not found.", + state->fc_modified_inodes[i]); + continue; + } + cur = 0; + end = EXT_MAX_BLOCKS; + while (cur < end) { + map.m_lblk = cur; + map.m_len = end - cur; + + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + break; + + if (ret > 0) { + path = ext4_find_extent(inode, map.m_lblk, NULL, 0); + if (!IS_ERR_OR_NULL(path)) { + for (j = 0; j < path->p_depth; j++) + ext4_mb_mark_bb(inode->i_sb, + path[j].p_block, 1, 1); + ext4_ext_drop_refs(path); + kfree(path); + } + cur += ret; + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, + map.m_len, 1); + } else { + cur = cur + (map.m_len ? map.m_len : 1); + } + } + iput(inode); + } +} + +/* + * Check if block is in excluded regions for block allocation. The simple + * allocator that runs during replay phase is calls this function to see + * if it is okay to use a block. + */ +bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) +{ + int i; + struct ext4_fc_replay_state *state; + + state = &EXT4_SB(sb)->s_fc_replay_state; + for (i = 0; i < state->fc_regions_valid; i++) { + if (state->fc_regions[i].ino == 0 || + state->fc_regions[i].len == 0) + continue; + if (blk >= state->fc_regions[i].pblk && + blk < state->fc_regions[i].pblk + state->fc_regions[i].len) + return true; + } + return false; +} + +/* Cleanup function called after replay */ +void ext4_fc_replay_cleanup(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + sbi->s_mount_state &= ~EXT4_FC_REPLAY; + kfree(sbi->s_fc_replay_state.fc_regions); + kfree(sbi->s_fc_replay_state.fc_modified_inodes); +} + +/* + * Recovery Scan phase handler + * + * This function is called during the scan phase and is responsible + * for doing following things: + * - Make sure the fast commit area has valid tags for replay + * - Count number of tags that need to be replayed by the replay handler + * - Verify CRC + * - Create a list of excluded blocks for allocation during replay phase + * + * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is + * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP + * to indicate that scan has finished and JBD2 can now start replay phase. + * It returns a negative error to indicate that there was an error. At the end + * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set + * to indicate the number of tags that need to replayed during the replay phase. + */ +static int ext4_fc_replay_scan(journal_t *journal, + struct buffer_head *bh, int off, + tid_t expected_tid) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_fc_replay_state *state; + int ret = JBD2_FC_REPLAY_CONTINUE; + struct ext4_fc_add_range *ext; + struct ext4_fc_tl *tl; + struct ext4_fc_tail *tail; + __u8 *start, *end; + struct ext4_fc_head *head; + struct ext4_extent *ex; + + state = &sbi->s_fc_replay_state; + + start = (u8 *)bh->b_data; + end = (__u8 *)bh->b_data + journal->j_blocksize - 1; + + if (state->fc_replay_expected_off == 0) { + state->fc_cur_tag = 0; + state->fc_replay_num_tags = 0; + state->fc_crc = 0; + state->fc_regions = NULL; + state->fc_regions_valid = state->fc_regions_used = + state->fc_regions_size = 0; + /* Check if we can stop early */ + if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag) + != EXT4_FC_TAG_HEAD) + return 0; + } + + if (off != state->fc_replay_expected_off) { + ret = -EFSCORRUPTED; + goto out_err; + } + + state->fc_replay_expected_off++; + fc_for_each_tl(start, end, tl) { + jbd_debug(3, "Scan phase, tag:%s, blk %lld\n", + tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr); + switch (le16_to_cpu(tl->fc_tag)) { + case EXT4_FC_TAG_ADD_RANGE: + ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl); + ex = (struct ext4_extent *)&ext->fc_ex; + ret = ext4_fc_record_regions(sb, + le32_to_cpu(ext->fc_ino), + le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), + ext4_ext_get_actual_len(ex)); + if (ret < 0) + break; + ret = JBD2_FC_REPLAY_CONTINUE; + fallthrough; + case EXT4_FC_TAG_DEL_RANGE: + case EXT4_FC_TAG_LINK: + case EXT4_FC_TAG_UNLINK: + case EXT4_FC_TAG_CREAT: + case EXT4_FC_TAG_INODE: + case EXT4_FC_TAG_PAD: + state->fc_cur_tag++; + state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl, + sizeof(*tl) + ext4_fc_tag_len(tl)); + break; + case EXT4_FC_TAG_TAIL: + state->fc_cur_tag++; + tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl); + state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl, + sizeof(*tl) + + offsetof(struct ext4_fc_tail, + fc_crc)); + if (le32_to_cpu(tail->fc_tid) == expected_tid && + le32_to_cpu(tail->fc_crc) == state->fc_crc) { + state->fc_replay_num_tags = state->fc_cur_tag; + state->fc_regions_valid = + state->fc_regions_used; + } else { + ret = state->fc_replay_num_tags ? + JBD2_FC_REPLAY_STOP : -EFSBADCRC; + } + state->fc_crc = 0; + break; + case EXT4_FC_TAG_HEAD: + head = (struct ext4_fc_head *)ext4_fc_tag_val(tl); + if (le32_to_cpu(head->fc_features) & + ~EXT4_FC_SUPPORTED_FEATURES) { + ret = -EOPNOTSUPP; + break; + } + if (le32_to_cpu(head->fc_tid) != expected_tid) { + ret = JBD2_FC_REPLAY_STOP; + break; + } + state->fc_cur_tag++; + state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl, + sizeof(*tl) + ext4_fc_tag_len(tl)); + break; + default: + ret = state->fc_replay_num_tags ? + JBD2_FC_REPLAY_STOP : -ECANCELED; + } + if (ret < 0 || ret == JBD2_FC_REPLAY_STOP) + break; + } + +out_err: + trace_ext4_fc_replay_scan(sb, ret, off); + return ret; +} + +/* + * Main recovery path entry point. + * The meaning of return codes is similar as above. + */ +static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, + enum passtype pass, int off, tid_t expected_tid) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_fc_tl *tl; + __u8 *start, *end; + int ret = JBD2_FC_REPLAY_CONTINUE; + struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; + struct ext4_fc_tail *tail; + + if (pass == PASS_SCAN) { + state->fc_current_pass = PASS_SCAN; + return ext4_fc_replay_scan(journal, bh, off, expected_tid); + } + + if (state->fc_current_pass != pass) { + state->fc_current_pass = pass; + sbi->s_mount_state |= EXT4_FC_REPLAY; + } + if (!sbi->s_fc_replay_state.fc_replay_num_tags) { + jbd_debug(1, "Replay stops\n"); + ext4_fc_set_bitmaps_and_counters(sb); + return 0; + } + +#ifdef CONFIG_EXT4_DEBUG + if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) { + pr_warn("Dropping fc block %d because max_replay set\n", off); + return JBD2_FC_REPLAY_STOP; + } +#endif + + start = (u8 *)bh->b_data; + end = (__u8 *)bh->b_data + journal->j_blocksize - 1; + + fc_for_each_tl(start, end, tl) { + if (state->fc_replay_num_tags == 0) { + ret = JBD2_FC_REPLAY_STOP; + ext4_fc_set_bitmaps_and_counters(sb); + break; + } + jbd_debug(3, "Replay phase, tag:%s\n", + tag2str(le16_to_cpu(tl->fc_tag))); + state->fc_replay_num_tags--; + switch (le16_to_cpu(tl->fc_tag)) { + case EXT4_FC_TAG_LINK: + ret = ext4_fc_replay_link(sb, tl); + break; + case EXT4_FC_TAG_UNLINK: + ret = ext4_fc_replay_unlink(sb, tl); + break; + case EXT4_FC_TAG_ADD_RANGE: + ret = ext4_fc_replay_add_range(sb, tl); + break; + case EXT4_FC_TAG_CREAT: + ret = ext4_fc_replay_create(sb, tl); + break; + case EXT4_FC_TAG_DEL_RANGE: + ret = ext4_fc_replay_del_range(sb, tl); + break; + case EXT4_FC_TAG_INODE: + ret = ext4_fc_replay_inode(sb, tl); + break; + case EXT4_FC_TAG_PAD: + trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, + ext4_fc_tag_len(tl), 0); + break; + case EXT4_FC_TAG_TAIL: + trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0, + ext4_fc_tag_len(tl), 0); + tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl); + WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid); + break; + case EXT4_FC_TAG_HEAD: + break; + default: + trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0, + ext4_fc_tag_len(tl), 0); + ret = -ECANCELED; + break; + } + if (ret < 0) + break; + ret = JBD2_FC_REPLAY_CONTINUE; + } + return ret; +} + +void ext4_fc_init(struct super_block *sb, journal_t *journal) +{ + /* + * We set replay callback even if fast commit disabled because we may + * could still have fast commit blocks that need to be replayed even if + * fast commit has now been turned off. + */ + journal->j_fc_replay_callback = ext4_fc_replay; + if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) + return; + journal->j_fc_cleanup_callback = ext4_fc_cleanup; + if (jbd2_fc_init(journal, EXT4_NUM_FC_BLKS)) { + pr_warn("Error while enabling fast commits, turning off."); + ext4_clear_feature_fast_commit(sb); + } +} + +const char *fc_ineligible_reasons[] = { + "Extended attributes changed", + "Cross rename", + "Journal flag changed", + "Insufficient memory", + "Swap boot", + "Resize", + "Dir renamed", + "Falloc range op", + "FC Commit Failed" +}; + +int ext4_fc_info_show(struct seq_file *seq, void *v) +{ + struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private); + struct ext4_fc_stats *stats = &sbi->s_fc_stats; + int i; + + if (v != SEQ_START_TOKEN) + return 0; + + seq_printf(seq, + "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", + stats->fc_num_commits, stats->fc_ineligible_commits, + stats->fc_numblks, + div_u64(sbi->s_fc_avg_commit_time, 1000)); + seq_puts(seq, "Ineligible reasons:\n"); + for (i = 0; i < EXT4_FC_REASON_MAX; i++) + seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], + stats->fc_ineligible_reason_count[i]); + + return 0; +} + +int __init ext4_fc_init_dentry_cache(void) +{ + ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, + SLAB_RECLAIM_ACCOUNT); + + if (ext4_fc_dentry_cachep == NULL) + return -ENOMEM; + + return 0; +} diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h new file mode 100644 index 000000000000..06907d485989 --- /dev/null +++ b/fs/ext4/fast_commit.h @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __FAST_COMMIT_H__ +#define __FAST_COMMIT_H__ + +/* Number of blocks in journal area to allocate for fast commits */ +#define EXT4_NUM_FC_BLKS 256 + +/* Fast commit tags */ +#define EXT4_FC_TAG_ADD_RANGE 0x0001 +#define EXT4_FC_TAG_DEL_RANGE 0x0002 +#define EXT4_FC_TAG_CREAT 0x0003 +#define EXT4_FC_TAG_LINK 0x0004 +#define EXT4_FC_TAG_UNLINK 0x0005 +#define EXT4_FC_TAG_INODE 0x0006 +#define EXT4_FC_TAG_PAD 0x0007 +#define EXT4_FC_TAG_TAIL 0x0008 +#define EXT4_FC_TAG_HEAD 0x0009 + +#define EXT4_FC_SUPPORTED_FEATURES 0x0 + +/* On disk fast commit tlv value structures */ + +/* Fast commit on disk tag length structure */ +struct ext4_fc_tl { + __le16 fc_tag; + __le16 fc_len; +}; + +/* Value structure for tag EXT4_FC_TAG_HEAD. */ +struct ext4_fc_head { + __le32 fc_features; + __le32 fc_tid; +}; + +/* Value structure for EXT4_FC_TAG_ADD_RANGE. */ +struct ext4_fc_add_range { + __le32 fc_ino; + __u8 fc_ex[12]; +}; + +/* Value structure for tag EXT4_FC_TAG_DEL_RANGE. */ +struct ext4_fc_del_range { + __le32 fc_ino; + __le32 fc_lblk; + __le32 fc_len; +}; + +/* + * This is the value structure for tags EXT4_FC_TAG_CREAT, EXT4_FC_TAG_LINK + * and EXT4_FC_TAG_UNLINK. + */ +struct ext4_fc_dentry_info { + __le32 fc_parent_ino; + __le32 fc_ino; + u8 fc_dname[0]; +}; + +/* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */ +struct ext4_fc_inode { + __le32 fc_ino; + __u8 fc_raw_inode[0]; +}; + +/* Value structure for tag EXT4_FC_TAG_TAIL. */ +struct ext4_fc_tail { + __le32 fc_tid; + __le32 fc_crc; +}; + +/* + * In memory list of dentry updates that are performed on the file + * system used by fast commit code. + */ +struct ext4_fc_dentry_update { + int fcd_op; /* Type of update create / unlink / link */ + int fcd_parent; /* Parent inode number */ + int fcd_ino; /* Inode number */ + struct qstr fcd_name; /* Dirent name */ + unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */ + struct list_head fcd_list; +}; + +/* + * Fast commit reason codes + */ +enum { + /* + * Commit status codes: + */ + EXT4_FC_REASON_OK = 0, + EXT4_FC_REASON_INELIGIBLE, + EXT4_FC_REASON_ALREADY_COMMITTED, + EXT4_FC_REASON_FC_START_FAILED, + EXT4_FC_REASON_FC_FAILED, + + /* + * Fast commit ineligiblity reasons: + */ + EXT4_FC_REASON_XATTR = 0, + EXT4_FC_REASON_CROSS_RENAME, + EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, + EXT4_FC_REASON_MEM, + EXT4_FC_REASON_SWAP_BOOT, + EXT4_FC_REASON_RESIZE, + EXT4_FC_REASON_RENAME_DIR, + EXT4_FC_REASON_FALLOC_RANGE, + EXT4_FC_COMMIT_FAILED, + EXT4_FC_REASON_MAX +}; + +struct ext4_fc_stats { + unsigned int fc_ineligible_reason_count[EXT4_FC_REASON_MAX]; + unsigned long fc_num_commits; + unsigned long fc_ineligible_commits; + unsigned long fc_numblks; +}; + +#define EXT4_FC_REPLAY_REALLOC_INCREMENT 4 + +/* + * Physical block regions added to different inodes due to fast commit + * recovery. These are set during the SCAN phase. During the replay phase, + * our allocator excludes these from its allocation. This ensures that + * we don't accidentally allocating a block that is going to be used by + * another inode. + */ +struct ext4_fc_alloc_region { + ext4_lblk_t lblk; + ext4_fsblk_t pblk; + int ino, len; +}; + +/* + * Fast commit replay state. + */ +struct ext4_fc_replay_state { + int fc_replay_num_tags; + int fc_replay_expected_off; + int fc_current_pass; + int fc_cur_tag; + int fc_crc; + struct ext4_fc_alloc_region *fc_regions; + int fc_regions_size, fc_regions_used, fc_regions_valid; + int *fc_modified_inodes; + int fc_modified_inodes_used, fc_modified_inodes_size; +}; + +#define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1) + +#define fc_for_each_tl(__start, __end, __tl) \ + for (tl = (struct ext4_fc_tl *)start; \ + (u8 *)tl < (u8 *)end; \ + tl = (struct ext4_fc_tl *)((u8 *)tl + \ + sizeof(struct ext4_fc_tl) + \ + + le16_to_cpu(tl->fc_len))) + + +#endif /* __FAST_COMMIT_H__ */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 7d61069531d3..d85412d12e3a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -260,6 +260,7 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; + ext4_fc_start_update(inode); inode_lock(inode); ret = ext4_write_checks(iocb, from); if (ret <= 0) @@ -271,6 +272,7 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, out: inode_unlock(inode); + ext4_fc_stop_update(inode); if (likely(ret > 0)) { iocb->ki_pos += ret; ret = generic_write_sync(iocb, ret); @@ -534,7 +536,9 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } + ext4_fc_start_update(inode); ret = ext4_orphan_add(handle, inode); + ext4_fc_stop_update(inode); if (ret) { ext4_journal_stop(handle); goto out; @@ -656,8 +660,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) #endif if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_write_iter(iocb, from); - - return ext4_buffered_write_iter(iocb, from); + else + return ext4_buffered_write_iter(iocb, from); } #ifdef CONFIG_FS_DAX @@ -757,6 +761,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) if (!daxdev_mapping_supported(vma, dax_dev)) return -EOPNOTSUPP; + ext4_fc_start_update(inode); file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; @@ -764,6 +769,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) } else { vma->vm_ops = &ext4_file_vm_ops; } + ext4_fc_stop_update(inode); return 0; } @@ -844,7 +850,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) return ret; } - filp->f_mode |= FMODE_NOWAIT; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; return dquot_file_open(inode, filp); } diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index dbccf46f1770..b232c2767534 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -108,6 +108,9 @@ static int ext4_getfsmap_helper(struct super_block *sb, /* Are we just counting mappings? */ if (info->gfi_head->fmh_count == 0) { + if (info->gfi_head->fmh_entries == UINT_MAX) + return EXT4_QUERY_RANGE_ABORT; + if (rec_fsblk > info->gfi_next_fsblk) info->gfi_head->fmh_entries++; @@ -571,8 +574,8 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb, if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev)) return true; - if (EXT4_SB(sb)->journal_bdev && - fm->fmr_device == new_encode_dev(EXT4_SB(sb)->journal_bdev->bd_dev)) + if (EXT4_SB(sb)->s_journal_bdev && + fm->fmr_device == new_encode_dev(EXT4_SB(sb)->s_journal_bdev->bd_dev)) return true; return false; } @@ -642,9 +645,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, memset(handlers, 0, sizeof(handlers)); handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev); handlers[0].gfd_fn = ext4_getfsmap_datadev; - if (EXT4_SB(sb)->journal_bdev) { + if (EXT4_SB(sb)->s_journal_bdev) { handlers[1].gfd_dev = new_encode_dev( - EXT4_SB(sb)->journal_bdev->bd_dev); + EXT4_SB(sb)->s_journal_bdev->bd_dev); handlers[1].gfd_fn = ext4_getfsmap_logdev; } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 1d668c8f131f..81a545fd14a3 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -112,7 +112,7 @@ static int ext4_fsync_journal(struct inode *inode, bool datasync, !jbd2_trans_will_send_data_barrier(journal, commit_tid)) *needs_barrier = true; - return jbd2_complete_transaction(journal, commit_tid); + return ext4_fc_commit(journal, commit_tid); } /* @@ -150,7 +150,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = file_write_and_wait_range(file, start, end); if (ret) - return ret; + goto out; /* * data=writeback,ordered: diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 698ca4a4db5f..b215c564bc31 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -82,7 +82,12 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, struct buffer_head *bh) { ext4_fsblk_t blk; - struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); + struct ext4_group_info *grp; + + if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) + return 0; + + grp = ext4_get_group_info(sb, block_group); if (buffer_verified(bh)) return 0; @@ -189,10 +194,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) * submit the buffer_head for reading */ trace_ext4_load_inode_bitmap(sb, block_group); - bh->b_end_io = ext4_end_bitmap_read; - get_bh(bh); - submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); - wait_on_buffer(bh); + ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read); ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO); if (!buffer_uptodate(bh)) { put_bh(bh); @@ -284,15 +286,17 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); /* Don't bother if the inode bitmap is corrupt. */ - grp = ext4_get_group_info(sb, block_group); if (IS_ERR(bitmap_bh)) { fatal = PTR_ERR(bitmap_bh); bitmap_bh = NULL; goto error_return; } - if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { - fatal = -EFSCORRUPTED; - goto error_return; + if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { + grp = ext4_get_group_info(sb, block_group); + if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { + fatal = -EFSCORRUPTED; + goto error_return; + } } BUFFER_TRACE(bitmap_bh, "get_write_access"); @@ -742,6 +746,122 @@ not_found: return 1; } +int ext4_mark_inode_used(struct super_block *sb, int ino) +{ + unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); + struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL; + struct ext4_group_desc *gdp; + ext4_group_t group; + int bit; + int err = -EFSCORRUPTED; + + if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) + goto out; + + group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); + inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); + if (IS_ERR(inode_bitmap_bh)) + return PTR_ERR(inode_bitmap_bh); + + if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) { + err = 0; + goto out; + } + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp || !group_desc_bh) { + err = -EINVAL; + goto out; + } + + ext4_set_bit(bit, inode_bitmap_bh->b_data); + + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + err = sync_dirty_buffer(inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + + /* We may have to initialize the block bitmap if it isn't already */ + if (ext4_has_group_desc_csum(sb) && + gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + struct buffer_head *block_bitmap_bh; + + block_bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(block_bitmap_bh)) { + err = PTR_ERR(block_bitmap_bh); + goto out; + } + + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); + err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh); + sync_dirty_buffer(block_bitmap_bh); + + /* recheck and clear flag under lock if we still need to */ + ext4_lock_group(sb, group); + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, group, gdp)); + ext4_block_bitmap_csum_set(sb, group, gdp, + block_bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + } + ext4_unlock_group(sb, group); + brelse(block_bitmap_bh); + + if (err) { + ext4_std_error(sb, err); + goto out; + } + } + + /* Update the relevant bg descriptor fields */ + if (ext4_has_group_desc_csum(sb)) { + int free; + + ext4_lock_group(sb, group); /* while we modify the bg desc */ + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + free = 0; + } + + /* + * Check the relative inode number against the last used + * relative inode number in this group. if it is greater + * we need to update the bg_itable_unused count + */ + if (bit >= free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - bit - 1)); + } else { + ext4_lock_group(sb, group); + } + + ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); + if (ext4_has_group_desc_csum(sb)) { + ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh, + EXT4_INODES_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, group, gdp); + } + + ext4_unlock_group(sb, group); + err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh); + sync_dirty_buffer(group_desc_bh); +out: + return err; +} + static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode, bool encrypt) { @@ -818,7 +938,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, struct inode *ret; ext4_group_t i; ext4_group_t flex_group; - struct ext4_group_info *grp; + struct ext4_group_info *grp = NULL; bool encrypt = false; /* Cannot create files in a deleted directory */ @@ -918,15 +1038,21 @@ got_group: if (ext4_free_inodes_count(sb, gdp) == 0) goto next_group; - grp = ext4_get_group_info(sb, group); - /* Skip groups with already-known suspicious inode tables */ - if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) - goto next_group; + if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { + grp = ext4_get_group_info(sb, group); + /* + * Skip groups with already-known suspicious inode + * tables + */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + goto next_group; + } brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); /* Skip groups with suspicious inode tables */ - if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || + if (((!(sbi->s_mount_state & EXT4_FC_REPLAY)) + && EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || IS_ERR(inode_bitmap_bh)) { inode_bitmap_bh = NULL; goto next_group; @@ -945,7 +1071,7 @@ repeat_in_this_group: goto next_group; } - if (!handle) { + if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, handle_type, nblocks, 0, @@ -1049,9 +1175,15 @@ got: /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { int free; - struct ext4_group_info *grp = ext4_get_group_info(sb, group); - - down_read(&grp->alloc_sem); /* protect vs itable lazyinit */ + struct ext4_group_info *grp = NULL; + + if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { + grp = ext4_get_group_info(sb, group); + down_read(&grp->alloc_sem); /* + * protect vs itable + * lazyinit + */ + } ext4_lock_group(sb, group); /* while we modify the bg desc */ free = EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp); @@ -1067,7 +1199,8 @@ got: if (ino > free) ext4_itable_unused_set(sb, gdp, (EXT4_INODES_PER_GROUP(sb) - ino)); - up_read(&grp->alloc_sem); + if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) + up_read(&grp->alloc_sem); } else { ext4_lock_group(sb, group); } diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 80c9f33800be..05efa682bc2f 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -163,7 +163,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, } if (!bh_uptodate_or_lock(bh)) { - if (bh_submit_read(bh) < 0) { + if (ext4_read_bh(bh, 0, NULL) < 0) { put_bh(bh); goto failure; } @@ -593,7 +593,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, if (ext4_has_feature_bigalloc(inode->i_sb)) { EXT4_ERROR_INODE(inode, "Can't allocate blocks for " "non-extent mapped inodes with bigalloc"); - return -EFSCORRUPTED; + err = -EFSCORRUPTED; + goto out; } /* Set up for the direct block allocation */ @@ -1012,14 +1013,14 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, } /* Go read the buffer for the next level down */ - bh = sb_bread(inode->i_sb, nr); + bh = ext4_sb_bread(inode->i_sb, nr, 0); /* * A read failure? Report error and clear slot * (should be rare). */ - if (!bh) { - ext4_error_inode_block(inode, nr, EIO, + if (IS_ERR(bh)) { + ext4_error_inode_block(inode, nr, -PTR_ERR(bh), "Read failure"); continue; } @@ -1033,7 +1034,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, brelse(bh); /* - * Everything below this this pointer has been + * Everything below this pointer has been * released. Now let this top-of-subtree go. * * We want the freeing of this indirect block to be diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 75c97bca0815..caa51473207d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -354,7 +354,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, if (error) goto out; - /* Update the xttr entry. */ + /* Update the xattr entry. */ i.value = value; i.value_len = len; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bf596467c234..03c2253005f0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -101,8 +101,8 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, return provided == calculated; } -static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, - struct ext4_inode_info *ei) +void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) { __u32 csum; @@ -514,7 +514,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, return -EFSCORRUPTED; /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -729,6 +730,8 @@ out_sem: if (ret) return ret; } + ext4_fc_track_range(inode, map->m_lblk, + map->m_lblk + map->m_len - 1); } if (retval < 0) @@ -825,7 +828,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, int create = map_flags & EXT4_GET_BLOCKS_CREATE; int err; - J_ASSERT(handle != NULL || create == 0); + J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + || handle != NULL || create == 0); map.m_lblk = block; map.m_len = 1; @@ -841,7 +845,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { J_ASSERT(create != 0); - J_ASSERT(handle != NULL); + J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + || (handle != NULL)); /* * Now that we do not always journal data, we should @@ -878,18 +883,20 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct buffer_head *bh; + int ret; bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; if (!bh || ext4_buffer_uptodate(bh)) return bh; - ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh); - wait_on_buffer(bh); - if (buffer_uptodate(bh)) - return bh; - put_bh(bh); - return ERR_PTR(-EIO); + + ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true); + if (ret) { + put_bh(bh); + return ERR_PTR(ret); + } + return bh; } /* Read a contiguous batch of blocks. */ @@ -910,8 +917,7 @@ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, for (i = 0; i < bh_count; i++) /* Note that NULL bhs[i] is valid because of holes. */ if (bhs[i] && !ext4_buffer_uptodate(bhs[i])) - ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, - &bhs[i]); + ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false); if (!wait) return 0; @@ -1081,7 +1087,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + ext4_read_bh_lock(bh, 0, false); wait[nr_wait++] = bh; } } @@ -1912,6 +1918,9 @@ static int __ext4_journalled_writepage(struct page *page, } if (ret == 0) ret = err; + err = ext4_jbd2_inode_add_write(handle, inode, 0, len); + if (ret == 0) + ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; err = ext4_journal_stop(handle); if (!ret) @@ -2254,7 +2263,7 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, err = PTR_ERR(io_end_vec); goto out; } - io_end_vec->offset = mpd->map.m_lblk << blkbits; + io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits; } *map_bh = true; goto out; @@ -2785,7 +2794,7 @@ retry: * ext4_journal_stop() can wait for transaction commit * to finish which may depend on writeback of pages to * complete or on page lock to be released. In that - * case, we have to wait until after after we have + * case, we have to wait until after we have * submitted all the IO, released page locks we hold, * and dropped io_end reference (for extent conversion * to be able to complete) before stopping the handle. @@ -3296,9 +3305,14 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - if (journal) - return !jbd2_transaction_committed(journal, - EXT4_I(inode)->i_datasync_tid); + if (journal) { + if (jbd2_transaction_committed(journal, + EXT4_I(inode)->i_datasync_tid)) + return true; + return atomic_read(&EXT4_SB(inode->i_sb)->s_fc_subtid) >= + EXT4_I(inode)->i_fc_committed_subtid; + } + /* Any metadata buffers to write? */ if (!list_empty(&inode->i_mapping->private_list)) return true; @@ -3436,14 +3450,26 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; - if (flags & IOMAP_WRITE) + if (flags & IOMAP_WRITE) { + /* + * We check here if the blocks are already allocated, then we + * don't need to start a journal txn and we can directly return + * the mapping information. This could boost performance + * especially in multi-threaded overwrite requests. + */ + if (offset + length <= i_size_read(inode)) { + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) + goto out; + } ret = ext4_iomap_alloc(inode, &map, flags); - else + } else { ret = ext4_map_blocks(NULL, inode, &map, 0); + } if (ret < 0) return ret; - +out: ext4_set_iomap(inode, iomap, &map, offset, length); return 0; @@ -3601,6 +3627,13 @@ static int ext4_set_page_dirty(struct page *page) return __set_page_dirty_buffers(page); } +static int ext4_iomap_swap_activate(struct swap_info_struct *sis, + struct file *file, sector_t *span) +{ + return iomap_swapfile_activate(sis, file, span, + &ext4_iomap_report_ops); +} + static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, .readahead = ext4_readahead, @@ -3616,6 +3649,7 @@ static const struct address_space_operations ext4_aops = { .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, + .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_journalled_aops = { @@ -3632,6 +3666,7 @@ static const struct address_space_operations ext4_journalled_aops = { .direct_IO = noop_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, + .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_da_aops = { @@ -3649,6 +3684,7 @@ static const struct address_space_operations ext4_da_aops = { .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, + .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_dax_aops = { @@ -3657,6 +3693,7 @@ static const struct address_space_operations ext4_dax_aops = { .set_page_dirty = noop_set_page_dirty, .bmap = ext4_bmap, .invalidatepage = noop_invalidatepage, + .swap_activate = ext4_iomap_swap_activate, }; void ext4_set_aops(struct inode *inode) @@ -3730,11 +3767,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - err = -EIO; - ll_rw_block(REQ_OP_READ, 0, 1, &bh); - wait_on_buffer(bh); - /* Uhhuh. Read error. Complain and punt. */ - if (!buffer_uptodate(bh)) + err = ext4_read_bh_lock(bh, 0, true); + if (err) goto unlock; if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. */ @@ -4073,6 +4107,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) up_write(&EXT4_I(inode)->i_data_sem); } + ext4_fc_track_range(inode, first_block, stop_block); if (IS_SYNC(inode)) ext4_handle_sync(handle); @@ -4252,22 +4287,22 @@ out_trace: * data in memory that is needed to recreate the on-disk version of this * inode. */ -static int __ext4_get_inode_loc(struct inode *inode, - struct ext4_iloc *iloc, int in_mem) +static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, + struct ext4_iloc *iloc, int in_mem, + ext4_fsblk_t *ret_block) { struct ext4_group_desc *gdp; struct buffer_head *bh; - struct super_block *sb = inode->i_sb; ext4_fsblk_t block; struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; - if (inode->i_ino < EXT4_ROOT_INO || - inode->i_ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) + if (ino < EXT4_ROOT_INO || + ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) return -EFSCORRUPTED; - iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); + iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); if (!gdp) return -EIO; @@ -4276,7 +4311,7 @@ static int __ext4_get_inode_loc(struct inode *inode, * Figure out the offset within the block group inode table */ inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; - inode_offset = ((inode->i_ino - 1) % + inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)); block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); @@ -4289,16 +4324,7 @@ static int __ext4_get_inode_loc(struct inode *inode, if (!buffer_uptodate(bh)) { lock_buffer(bh); - /* - * If the buffer has the write error flag, we have failed - * to write out another inode in the same block. In this - * case, we don't have to read the block because we may - * read the old inode data successfully. - */ - if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) - set_buffer_uptodate(bh); - - if (buffer_uptodate(bh)) { + if (ext4_buffer_uptodate(bh)) { /* someone brought it uptodate while we waited */ unlock_buffer(bh); goto has_buffer; @@ -4369,7 +4395,7 @@ make_io: if (end > table) end = table; while (b <= end) - sb_breadahead_unmovable(sb, b++); + ext4_sb_breadahead_unmovable(sb, b++); } /* @@ -4377,16 +4403,14 @@ make_io: * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ - trace_ext4_load_inode(inode); - get_bh(bh); - bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); + trace_ext4_load_inode(sb, ino); + ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { simulate_eio: - ext4_error_inode_block(inode, block, EIO, - "unable to read itable block"); + if (ret_block) + *ret_block = block; brelse(bh); return -EIO; } @@ -4396,11 +4420,43 @@ has_buffer: return 0; } +static int __ext4_get_inode_loc_noinmem(struct inode *inode, + struct ext4_iloc *iloc) +{ + ext4_fsblk_t err_blk; + int ret; + + ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, 0, + &err_blk); + + if (ret == -EIO) + ext4_error_inode_block(inode, err_blk, EIO, + "unable to read itable block"); + + return ret; +} + int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) { + ext4_fsblk_t err_blk; + int ret; + /* We have all inode data except xattrs in memory here. */ - return __ext4_get_inode_loc(inode, iloc, - !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); + ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, + !ext4_test_inode_state(inode, EXT4_STATE_XATTR), &err_blk); + + if (ret == -EIO) + ext4_error_inode_block(inode, err_blk, EIO, + "unable to read itable block"); + + return ret; +} + + +int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, + struct ext4_iloc *iloc) +{ + return __ext4_get_inode_loc(sb, ino, iloc, 0, NULL); } static bool ext4_should_enable_dax(struct inode *inode) @@ -4566,7 +4622,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei = EXT4_I(inode); iloc.bh = NULL; - ret = __ext4_get_inode_loc(inode, &iloc, 0); + ret = __ext4_get_inode_loc_noinmem(inode, &iloc); if (ret < 0) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); @@ -4612,10 +4668,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, sizeof(gen)); } - if (!ext4_inode_csum_verify(inode, raw_inode, ei) || - ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) { - ext4_error_inode_err(inode, function, line, 0, EFSBADCRC, - "iget: checksum invalid"); + if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || + ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) && + (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) { + ext4_error_inode_err(inode, function, line, 0, + EFSBADCRC, "iget: checksum invalid"); ret = -EFSBADCRC; goto bad_inode; } @@ -4703,6 +4760,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, for (block = 0; block < EXT4_N_BLOCKS; block++) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); + ext4_fc_init_inode(&ei->vfs_inode); /* * Set transaction id's of transactions that have to be committed @@ -4768,9 +4826,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, goto bad_inode; } else if (!ext4_has_inline_data(inode)) { /* validate the block references in the inode */ - if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode))) { + if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode)))) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_check_inode(inode); else @@ -4971,6 +5030,12 @@ static int ext4_do_update_inode(handle_t *handle, if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); + err = ext4_inode_blocks_set(handle, raw_inode, ei); + if (err) { + spin_unlock(&ei->i_raw_lock); + goto out_brelse; + } + raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); @@ -5004,11 +5069,6 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); - err = ext4_inode_blocks_set(handle, raw_inode, ei); - if (err) { - spin_unlock(&ei->i_raw_lock); - goto out_brelse; - } raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) @@ -5149,12 +5209,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; - err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, + err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); } else { struct ext4_iloc iloc; - err = __ext4_get_inode_loc(inode, &iloc, 0); + err = __ext4_get_inode_loc_noinmem(inode, &iloc); if (err) return err; /* @@ -5278,6 +5338,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; } + ext4_fc_start_update(inode); if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { handle_t *handle; @@ -5301,6 +5362,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (error) { ext4_journal_stop(handle); + ext4_fc_stop_update(inode); return error; } /* Update corresponding info in inode so that everything is in @@ -5323,11 +5385,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (attr->ia_size > sbi->s_bitmap_maxbytes) + if (attr->ia_size > sbi->s_bitmap_maxbytes) { + ext4_fc_stop_update(inode); return -EFBIG; + } } - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->i_mode)) { + ext4_fc_stop_update(inode); return -EINVAL; + } if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) inode_inc_iversion(inode); @@ -5351,7 +5417,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) rc = ext4_break_layouts(inode); if (rc) { up_write(&EXT4_I(inode)->i_mmap_sem); - return rc; + goto err_out; } if (attr->ia_size != inode->i_size) { @@ -5372,6 +5438,21 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) inode->i_mtime = current_time(inode); inode->i_ctime = inode->i_mtime; } + + if (shrink) + ext4_fc_track_range(inode, + (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> + inode->i_sb->s_blocksize_bits, + (oldsize > 0 ? oldsize - 1 : 0) >> + inode->i_sb->s_blocksize_bits); + else + ext4_fc_track_range( + inode, + (oldsize > 0 ? oldsize - 1 : oldsize) >> + inode->i_sb->s_blocksize_bits, + (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> + inode->i_sb->s_blocksize_bits); + down_write(&EXT4_I(inode)->i_data_sem); EXT4_I(inode)->i_disksize = attr->ia_size; rc = ext4_mark_inode_dirty(handle, inode); @@ -5430,9 +5511,11 @@ out_mmap_sem: rc = posix_acl_chmod(inode, inode->i_mode); err_out: - ext4_std_error(inode->i_sb, error); + if (error) + ext4_std_error(inode->i_sb, error); if (!error) error = rc; + ext4_fc_stop_update(inode); return error; } @@ -5614,6 +5697,8 @@ int ext4_mark_iloc_dirty(handle_t *handle, put_bh(iloc->bh); return -EIO; } + ext4_fc_track_inode(inode); + if (IS_I_VERSION(inode)) inode_inc_iversion(inode); @@ -5937,6 +6022,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) if (IS_ERR(handle)) return PTR_ERR(handle); + ext4_fc_mark_ineligible(inode->i_sb, + EXT4_FC_REASON_JOURNAL_FLAG_CHANGE); err = ext4_mark_inode_dirty(handle, inode); ext4_handle_sync(handle); ext4_journal_stop(handle); @@ -5977,9 +6064,17 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) if (err) goto out_ret; + /* + * On data journalling we skip straight to the transaction handle: + * there's no delalloc; page truncated will be checked later; the + * early return w/ all buffers mapped (calculates size/len) can't + * be used; and there's no dioread_nolock, so only ext4_get_block. + */ + if (ext4_should_journal_data(inode)) + goto retry_alloc; + /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && - !ext4_should_journal_data(inode) && !ext4_nonda_switch(inode->i_sb)) { do { err = block_page_mkwrite(vma, vmf, @@ -6005,6 +6100,9 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) /* * Return if we have all the buffers mapped. This avoids the need to do * journal_start/journal_stop which can block and take a long time + * + * This cannot be done for data journalling, as we have to add the + * inode to the transaction's list to writeprotect pages on commit. */ if (page_has_buffers(page)) { if (!ext4_walk_page_buffers(NULL, page_buffers(page), @@ -6029,16 +6127,42 @@ retry_alloc: ret = VM_FAULT_SIGBUS; goto out; } - err = block_page_mkwrite(vma, vmf, get_block); - if (!err && ext4_should_journal_data(inode)) { - if (ext4_walk_page_buffers(handle, page_buffers(page), 0, - PAGE_SIZE, NULL, do_journal_get_write_access)) { - unlock_page(page); + /* + * Data journalling can't use block_page_mkwrite() because it + * will set_buffer_dirty() before do_journal_get_write_access() + * thus might hit warning messages for dirty metadata buffers. + */ + if (!ext4_should_journal_data(inode)) { + err = block_page_mkwrite(vma, vmf, get_block); + } else { + lock_page(page); + size = i_size_read(inode); + /* Page got truncated from under us? */ + if (page->mapping != mapping || page_offset(page) > size) { + ret = VM_FAULT_NOPAGE; + goto out_error; + } + + if (page->index == size >> PAGE_SHIFT) + len = size & ~PAGE_MASK; + else + len = PAGE_SIZE; + + err = __block_write_begin(page, 0, len, ext4_get_block); + if (!err) { ret = VM_FAULT_SIGBUS; - ext4_journal_stop(handle); - goto out; + if (ext4_walk_page_buffers(handle, page_buffers(page), + 0, len, NULL, do_journal_get_write_access)) + goto out_error; + if (ext4_walk_page_buffers(handle, page_buffers(page), + 0, len, NULL, write_end_fn)) + goto out_error; + if (ext4_jbd2_inode_add_write(handle, inode, 0, len)) + goto out_error; + ext4_set_inode_state(inode, EXT4_STATE_JDATA); + } else { + unlock_page(page); } - ext4_set_inode_state(inode, EXT4_STATE_JDATA); } ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -6049,6 +6173,10 @@ out: up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(inode->i_sb); return ret; +out_error: + unlock_page(page); + ext4_journal_stop(handle); + goto out; } vm_fault_t ext4_filemap_fault(struct vm_fault *vmf) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 36eca3bc036a..f0381876a7e5 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -86,7 +86,7 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) i_size_write(inode2, isize); } -static void reset_inode_seed(struct inode *inode) +void ext4_reset_inode_seed(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -165,6 +165,7 @@ static long swap_inode_boot_loader(struct super_block *sb, err = -EINVAL; goto err_out; } + ext4_fc_start_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT); /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl); @@ -199,8 +200,8 @@ static long swap_inode_boot_loader(struct super_block *sb, inode->i_generation = prandom_u32(); inode_bl->i_generation = prandom_u32(); - reset_inode_seed(inode); - reset_inode_seed(inode_bl); + ext4_reset_inode_seed(inode); + ext4_reset_inode_seed(inode_bl); ext4_discard_preallocations(inode, 0); @@ -247,6 +248,7 @@ revert: err_out1: ext4_journal_stop(handle); + ext4_fc_stop_ineligible(sb); ext4_double_up_write_data_sem(inode, inode_bl); err_out: @@ -807,7 +809,7 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) return error; } -long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; @@ -1074,6 +1076,7 @@ mext_out: err = ext4_resize_fs(sb, n_blocks_count); if (EXT4_SB(sb)->s_journal) { + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); @@ -1308,6 +1311,17 @@ out: } } +long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + long ret; + + ext4_fc_start_update(file_inode(filp)); + ret = __ext4_ioctl(filp, cmd, arg); + ext4_fc_stop_update(file_inode(filp)); + + return ret; +} + #ifdef CONFIG_COMPAT long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 132c118d12e1..85abbfb98cbe 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -124,7 +124,7 @@ * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe=<value> option the group prealloc request is normalized to the - * the smallest multiple of the stripe value (sbi->s_stripe) which is + * smallest multiple of the stripe value (sbi->s_stripe) which is * greater than the default mb_group_prealloc. * * The regular allocator (using the buddy cache) supports a few tunables. @@ -619,11 +619,8 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, void *buddy; void *buddy2; - { - static int mb_check_counter; - if (mb_check_counter++ % 100 != 0) - return 0; - } + if (e4b->bd_info->bb_check_counter++ % 10) + return 0; while (order > 1) { buddy = mb_find_buddy(e4b, order, &max); @@ -1394,9 +1391,6 @@ void ext4_set_bits(void *bm, int cur, int len) } } -/* - * _________________________________________________________________ */ - static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) { if (mb_test_bit(*bit + side, bitmap)) { @@ -1508,14 +1502,16 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(sbi, block); - ext4_grp_locked_error(sb, e4b->bd_group, - inode ? inode->i_ino : 0, - blocknr, - "freeing already freed block " - "(bit %u); block bitmap corrupt.", - block); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? inode->i_ino : 0, + blocknr, + "freeing already freed block (bit %u); block bitmap corrupt.", + block); + ext4_mark_group_bitmap_corrupted( + sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); + } mb_regenerate_buddy(e4b); goto done; } @@ -2019,7 +2015,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, /* * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we - * we have free blocks + * have free blocks */ ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " @@ -3303,6 +3299,84 @@ out_err: } /* + * Idempotent helper for Ext4 fast commit replay path to set the state of + * blocks in bitmaps and update counters. + */ +void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, + int len, int state) +{ + struct buffer_head *bitmap_bh = NULL; + struct ext4_group_desc *gdp; + struct buffer_head *gdp_bh; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t group; + ext4_grpblk_t blkoff; + int i, clen, err; + int already; + + clen = EXT4_B2C(sbi, len); + + ext4_get_group_no_and_offset(sb, block, &group, &blkoff); + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); + bitmap_bh = NULL; + goto out_err; + } + + err = -EIO; + gdp = ext4_get_group_desc(sb, group, &gdp_bh); + if (!gdp) + goto out_err; + + ext4_lock_group(sb, group); + already = 0; + for (i = 0; i < clen; i++) + if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == !state) + already++; + + if (state) + ext4_set_bits(bitmap_bh->b_data, blkoff, clen); + else + mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen); + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, + group, gdp)); + } + if (state) + clen = ext4_free_group_clusters(sb, gdp) - clen + already; + else + clen = ext4_free_group_clusters(sb, gdp) + clen - already; + + ext4_free_group_clusters_set(sb, gdp, clen); + ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + + ext4_unlock_group(sb, group); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, group); + + atomic64_sub(len, + &sbi_array_rcu_deref(sbi, s_flex_groups, + flex_group)->free_clusters); + } + + err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); + if (err) + goto out_err; + sync_dirty_buffer(bitmap_bh); + err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); + sync_dirty_buffer(gdp_bh); + +out_err: + brelse(bitmap_bh); +} + +/* * here we normalize request for locality group * Group request are normalized to s_mb_group_prealloc, which goes to * s_strip if we set the same via mount option. @@ -4160,7 +4234,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, struct ext4_buddy e4b; int err; int busy = 0; - int free = 0; + int free, free_total = 0; mb_debug(sb, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) @@ -4188,8 +4262,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, INIT_LIST_HEAD(&list); repeat: + free = 0; ext4_lock_group(sb, group); - this_cpu_inc(discard_pa_seq); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { spin_lock(&pa->pa_lock); @@ -4206,6 +4280,9 @@ repeat: /* seems this one can be freed ... */ ext4_mb_mark_pa_deleted(sb, pa); + if (!free) + this_cpu_inc(discard_pa_seq); + /* we can trust pa_free ... */ free += pa->pa_free; @@ -4215,22 +4292,6 @@ repeat: list_add(&pa->u.pa_tmp_list, &list); } - /* if we still need more blocks and some PAs were used, try again */ - if (free < needed && busy) { - busy = 0; - ext4_unlock_group(sb, group); - cond_resched(); - goto repeat; - } - - /* found anything to free? */ - if (list_empty(&list)) { - BUG_ON(free != 0); - mb_debug(sb, "Someone else may have freed PA for this group %u\n", - group); - goto out; - } - /* now free all selected PAs */ list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { @@ -4248,14 +4309,22 @@ repeat: call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } -out: + free_total += free; + + /* if we still need more blocks and some PAs were used, try again */ + if (free_total < needed && busy) { + ext4_unlock_group(sb, group); + cond_resched(); + busy = 0; + goto repeat; + } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); out_dbg: mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", - free, group, grp->bb_free); - return free; + free_total, group, grp->bb_free); + return free_total; } /* @@ -4283,6 +4352,9 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) return; } + if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) + return; + mb_debug(sb, "discard preallocation for inode %lu\n", inode->i_ino); trace_ext4_discard_preallocations(inode, @@ -4830,6 +4902,9 @@ out_dbg: return ret; } +static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, + struct ext4_allocation_request *ar, int *errp); + /* * Main entry point into mballoc to allocate blocks * it tries to use preallocation first, then falls back @@ -4851,6 +4926,8 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, sbi = EXT4_SB(sb); trace_ext4_request_blocks(ar); + if (sbi->s_mount_state & EXT4_FC_REPLAY) + return ext4_mb_new_blocks_simple(handle, ar, errp); /* Allow to use superuser reservation for quota file */ if (ext4_is_quota_file(ar->inode)) @@ -5078,6 +5155,102 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, return 0; } +/* + * Simple allocator for Ext4 fast commit replay path. It searches for blocks + * linearly starting at the goal block and also excludes the blocks which + * are going to be in use after fast commit replay. + */ +static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, + struct ext4_allocation_request *ar, int *errp) +{ + struct buffer_head *bitmap_bh; + struct super_block *sb = ar->inode->i_sb; + ext4_group_t group; + ext4_grpblk_t blkoff; + int i; + ext4_fsblk_t goal, block; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + goal = ar->goal; + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= ext4_blocks_count(es)) + goal = le32_to_cpu(es->s_first_data_block); + + ar->len = 0; + ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); + for (; group < ext4_get_groups_count(sb); group++) { + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + *errp = PTR_ERR(bitmap_bh); + pr_warn("Failed to read block bitmap\n"); + return 0; + } + + ext4_get_group_no_and_offset(sb, + max(ext4_group_first_block_no(sb, group), goal), + NULL, &blkoff); + i = mb_find_next_zero_bit(bitmap_bh->b_data, sb->s_blocksize, + blkoff); + brelse(bitmap_bh); + if (i >= sb->s_blocksize) + continue; + if (ext4_fc_replay_check_excluded(sb, + ext4_group_first_block_no(sb, group) + i)) + continue; + break; + } + + if (group >= ext4_get_groups_count(sb) && i >= sb->s_blocksize) + return 0; + + block = ext4_group_first_block_no(sb, group) + i; + ext4_mb_mark_bb(sb, block, 1, 1); + ar->len = 1; + + return block; +} + +static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, + unsigned long count) +{ + struct buffer_head *bitmap_bh; + struct super_block *sb = inode->i_sb; + struct ext4_group_desc *gdp; + struct buffer_head *gdp_bh; + ext4_group_t group; + ext4_grpblk_t blkoff; + int already_freed = 0, err, i; + + ext4_get_group_no_and_offset(sb, block, &group, &blkoff); + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); + pr_warn("Failed to read block bitmap\n"); + return; + } + gdp = ext4_get_group_desc(sb, group, &gdp_bh); + if (!gdp) + return; + + for (i = 0; i < count; i++) { + if (!mb_test_bit(blkoff + i, bitmap_bh->b_data)) + already_freed++; + } + mb_clear_bits(bitmap_bh->b_data, blkoff, count); + err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); + if (err) + return; + ext4_free_group_clusters_set( + sb, gdp, ext4_free_group_clusters(sb, gdp) + + count - already_freed); + ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); + sync_dirty_buffer(bitmap_bh); + sync_dirty_buffer(gdp_bh); + brelse(bitmap_bh); +} + /** * ext4_free_blocks() -- Free given blocks and update quota * @handle: handle for this transaction @@ -5104,6 +5277,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, int err = 0; int ret; + sbi = EXT4_SB(sb); + + if (sbi->s_mount_state & EXT4_FC_REPLAY) { + ext4_free_blocks_simple(inode, block, count); + return; + } + might_sleep(); if (bh) { if (block) @@ -5112,7 +5292,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, block = bh->b_blocknr; } - sbi = EXT4_SB(sb); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index d34cb8c46655..795c3ff2907c 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -85,15 +85,11 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, } } - get_bh(*bh); lock_buffer(*bh); - (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, *bh); - wait_on_buffer(*bh); - if (!buffer_uptodate(*bh)) { - ret = -EIO; + ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL); + if (ret) goto warn_exit; - } + mmp = (struct mmp_struct *)((*bh)->b_data); if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) { ret = -EFSCORRUPTED; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 0d601b822875..64a579734f93 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -215,7 +215,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) for (i = 0; i < nr; i++) { bh = arr[i]; if (!bh_uptodate_or_lock(bh)) { - err = bh_submit_read(bh); + err = ext4_read_bh(bh, 0, NULL); if (err) return err; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 0d74615fcce3..5159830dacb8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2553,7 +2553,7 @@ out: * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set * on regular files) and to avoid creating huge/slow non-HTREE directories. */ -static void ext4_inc_count(handle_t *handle, struct inode *inode) +static void ext4_inc_count(struct inode *inode) { inc_nlink(inode); if (is_dx(inode) && @@ -2565,7 +2565,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode) * If a directory had nlink == 1, then we should let it be 1. This indicates * directory has >EXT4_LINK_MAX subdirs. */ -static void ext4_dec_count(handle_t *handle, struct inode *inode) +static void ext4_dec_count(struct inode *inode) { if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) drop_nlink(inode); @@ -2610,7 +2610,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { handle_t *handle; - struct inode *inode; + struct inode *inode, *inode_save; int err, credits, retries = 0; err = dquot_initialize(dir); @@ -2628,7 +2628,11 @@ retry: inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); + inode_save = inode; + ihold(inode_save); err = ext4_add_nondir(handle, dentry, &inode); + ext4_fc_track_create(inode_save, dentry); + iput(inode_save); } if (handle) ext4_journal_stop(handle); @@ -2643,7 +2647,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { handle_t *handle; - struct inode *inode; + struct inode *inode, *inode_save; int err, credits, retries = 0; err = dquot_initialize(dir); @@ -2660,7 +2664,12 @@ retry: if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ext4_special_inode_operations; + inode_save = inode; + ihold(inode_save); err = ext4_add_nondir(handle, dentry, &inode); + if (!err) + ext4_fc_track_create(inode_save, dentry); + iput(inode_save); } if (handle) ext4_journal_stop(handle); @@ -2739,7 +2748,7 @@ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, return ext4_next_entry(de, blocksize); } -static int ext4_init_new_dir(handle_t *handle, struct inode *dir, +int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct inode *inode) { struct buffer_head *dir_block = NULL; @@ -2824,7 +2833,9 @@ out_clear_inode: iput(inode); goto out_retry; } - ext4_inc_count(handle, dir); + ext4_fc_track_create(inode, dentry); + ext4_inc_count(dir); + ext4_update_dx_flag(dir); err = ext4_mark_inode_dirty(handle, dir); if (err) @@ -3162,8 +3173,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) retval = ext4_mark_inode_dirty(handle, inode); if (retval) goto end_rmdir; - ext4_dec_count(handle, dir); + ext4_dec_count(dir); ext4_update_dx_flag(dir); + ext4_fc_track_unlink(inode, dentry); retval = ext4_mark_inode_dirty(handle, dir); #ifdef CONFIG_UNICODE @@ -3184,42 +3196,32 @@ end_rmdir: return retval; } -static int ext4_unlink(struct inode *dir, struct dentry *dentry) +int __ext4_unlink(struct inode *dir, const struct qstr *d_name, + struct inode *inode) { - int retval; - struct inode *inode; + int retval = -ENOENT; struct buffer_head *bh; struct ext4_dir_entry_2 *de; handle_t *handle = NULL; + int skip_remove_dentry = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) - return -EIO; - - trace_ext4_unlink_enter(dir, dentry); - /* Initialize quotas before so that eventual writes go - * in separate transaction */ - retval = dquot_initialize(dir); - if (retval) - goto out_trace; - retval = dquot_initialize(d_inode(dentry)); - if (retval) - goto out_trace; - - bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); - if (IS_ERR(bh)) { - retval = PTR_ERR(bh); - goto out_trace; - } - if (!bh) { - retval = -ENOENT; - goto out_trace; - } + bh = ext4_find_entry(dir, d_name, &de, NULL); + if (IS_ERR(bh)) + return PTR_ERR(bh); - inode = d_inode(dentry); + if (!bh) + return -ENOENT; if (le32_to_cpu(de->inode) != inode->i_ino) { - retval = -EFSCORRUPTED; - goto out_bh; + /* + * It's okay if we find dont find dentry which matches + * the inode. That's because it might have gotten + * renamed to a different inode number + */ + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + skip_remove_dentry = 1; + else + goto out_bh; } handle = ext4_journal_start(dir, EXT4_HT_DIR, @@ -3232,17 +3234,21 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - retval = ext4_delete_entry(handle, dir, de, bh); - if (retval) - goto out_handle; - dir->i_ctime = dir->i_mtime = current_time(dir); - ext4_update_dx_flag(dir); - retval = ext4_mark_inode_dirty(handle, dir); - if (retval) - goto out_handle; + if (!skip_remove_dentry) { + retval = ext4_delete_entry(handle, dir, de, bh); + if (retval) + goto out_handle; + dir->i_ctime = dir->i_mtime = current_time(dir); + ext4_update_dx_flag(dir); + retval = ext4_mark_inode_dirty(handle, dir); + if (retval) + goto out_handle; + } else { + retval = 0; + } if (inode->i_nlink == 0) ext4_warning_inode(inode, "Deleting file '%.*s' with no links", - dentry->d_name.len, dentry->d_name.name); + d_name->len, d_name->name); else drop_nlink(inode); if (!inode->i_nlink) @@ -3250,6 +3256,35 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) inode->i_ctime = current_time(inode); retval = ext4_mark_inode_dirty(handle, inode); +out_handle: + ext4_journal_stop(handle); +out_bh: + brelse(bh); + return retval; +} + +static int ext4_unlink(struct inode *dir, struct dentry *dentry) +{ + int retval; + + if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + return -EIO; + + trace_ext4_unlink_enter(dir, dentry); + /* + * Initialize quotas before so that eventual writes go + * in separate transaction + */ + retval = dquot_initialize(dir); + if (retval) + goto out_trace; + retval = dquot_initialize(d_inode(dentry)); + if (retval) + goto out_trace; + + retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry)); + if (!retval) + ext4_fc_track_unlink(d_inode(dentry), dentry); #ifdef CONFIG_UNICODE /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid @@ -3261,10 +3296,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) d_invalidate(dentry); #endif -out_handle: - ext4_journal_stop(handle); -out_bh: - brelse(bh); out_trace: trace_ext4_unlink_exit(dentry, retval); return retval; @@ -3345,7 +3376,8 @@ static int ext4_symlink(struct inode *dir, */ drop_nlink(inode); err = ext4_orphan_add(handle, inode); - ext4_journal_stop(handle); + if (handle) + ext4_journal_stop(handle); handle = NULL; if (err) goto err_drop_inode; @@ -3399,29 +3431,10 @@ out_free_encrypted_link: return err; } -static int ext4_link(struct dentry *old_dentry, - struct inode *dir, struct dentry *dentry) +int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry) { handle_t *handle; - struct inode *inode = d_inode(old_dentry); int err, retries = 0; - - if (inode->i_nlink >= EXT4_LINK_MAX) - return -EMLINK; - - err = fscrypt_prepare_link(old_dentry, dir, dentry); - if (err) - return err; - - if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && - (!projid_eq(EXT4_I(dir)->i_projid, - EXT4_I(old_dentry->d_inode)->i_projid))) - return -EXDEV; - - err = dquot_initialize(dir); - if (err) - return err; - retry: handle = ext4_journal_start(dir, EXT4_HT_DIR, (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -3433,11 +3446,12 @@ retry: ext4_handle_sync(handle); inode->i_ctime = current_time(inode); - ext4_inc_count(handle, inode); + ext4_inc_count(inode); ihold(inode); err = ext4_add_entry(handle, dentry, inode); if (!err) { + ext4_fc_track_link(inode, dentry); err = ext4_mark_inode_dirty(handle, inode); /* this can happen only for tmpfile being * linked the first time @@ -3455,6 +3469,29 @@ retry: return err; } +static int ext4_link(struct dentry *old_dentry, + struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + int err; + + if (inode->i_nlink >= EXT4_LINK_MAX) + return -EMLINK; + + err = fscrypt_prepare_link(old_dentry, dir, dentry); + if (err) + return err; + + if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && + (!projid_eq(EXT4_I(dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(dir); + if (err) + return err; + return __ext4_link(dir, inode, dentry); +} /* * Try to find buffer head where contains the parent block. @@ -3630,9 +3667,9 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) { if (ent->dir_nlink_delta) { if (ent->dir_nlink_delta == -1) - ext4_dec_count(handle, ent->dir); + ext4_dec_count(ent->dir); else - ext4_inc_count(handle, ent->dir); + ext4_inc_count(ent->dir); ext4_mark_inode_dirty(handle, ent->dir); } } @@ -3844,7 +3881,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, } if (new.inode) { - ext4_dec_count(handle, new.inode); + ext4_dec_count(new.inode); new.inode->i_ctime = current_time(new.inode); } old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir); @@ -3854,14 +3891,14 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (retval) goto end_rename; - ext4_dec_count(handle, old.dir); + ext4_dec_count(old.dir); if (new.inode) { /* checked ext4_empty_dir above, can't have another * parent, ext4_dec_count() won't work for many-linked * dirs */ clear_nlink(new.inode); } else { - ext4_inc_count(handle, new.dir); + ext4_inc_count(new.dir); ext4_update_dx_flag(new.dir); retval = ext4_mark_inode_dirty(handle, new.dir); if (unlikely(retval)) @@ -3871,6 +3908,22 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, retval = ext4_mark_inode_dirty(handle, old.dir); if (unlikely(retval)) goto end_rename; + + if (S_ISDIR(old.inode->i_mode)) { + /* + * We disable fast commits here that's because the + * replay code is not yet capable of changing dot dot + * dirents in directories. + */ + ext4_fc_mark_ineligible(old.inode->i_sb, + EXT4_FC_REASON_RENAME_DIR); + } else { + if (new.inode) + ext4_fc_track_unlink(new.inode, new.dentry); + ext4_fc_track_link(old.inode, new.dentry); + ext4_fc_track_unlink(old.inode, old.dentry); + } + if (new.inode) { retval = ext4_mark_inode_dirty(handle, new.inode); if (unlikely(retval)) @@ -4014,7 +4067,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, retval = ext4_mark_inode_dirty(handle, new.inode); if (unlikely(retval)) goto end_rename; - + ext4_fc_mark_ineligible(new.inode->i_sb, + EXT4_FC_REASON_CROSS_RENAME); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a50b51270ea9..928700d57eb6 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -843,8 +843,10 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, BUFFER_TRACE(dind, "get_write_access"); err = ext4_journal_get_write_access(handle, dind); - if (unlikely(err)) + if (unlikely(err)) { ext4_std_error(sb, err); + goto errout; + } /* ext4_reserve_inode_write() gets a reference on the iloc */ err = ext4_reserve_inode_write(handle, inode, &iloc); @@ -1243,7 +1245,7 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) if (unlikely(!bh)) return NULL; if (!bh_uptodate_or_lock(bh)) { - if (bh_submit_read(bh) < 0) { + if (ext4_read_bh(bh, 0, NULL) < 0) { brelse(bh); return NULL; } @@ -1806,8 +1808,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, o_blocks_count + add, add); /* See if the device is actually as big as what was requested */ - bh = sb_bread(sb, o_blocks_count + add - 1); - if (!bh) { + bh = ext4_sb_bread(sb, o_blocks_count + add - 1, 0); + if (IS_ERR(bh)) { ext4_warning(sb, "can't read last block, resize aborted"); return -ENOSPC; } @@ -1932,8 +1934,8 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) int meta_bg; /* See if the device is actually as big as what was requested */ - bh = sb_bread(sb, n_blocks_count - 1); - if (!bh) { + bh = ext4_sb_bread(sb, n_blocks_count - 1, 0); + if (IS_ERR(bh)) { ext4_warning(sb, "can't read last block, resize aborted"); return -ENOSPC; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8b2736283481..2fe141ff3c7e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -141,27 +141,115 @@ MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) + +static inline void __ext4_read_bh(struct buffer_head *bh, int op_flags, + bh_end_io_t *end_io) +{ + /* + * buffer's verified bit is no longer valid after reading from + * disk again due to write out error, clear it to make sure we + * recheck the buffer contents. + */ + clear_buffer_verified(bh); + + bh->b_end_io = end_io ? end_io : end_buffer_read_sync; + get_bh(bh); + submit_bh(REQ_OP_READ, op_flags, bh); +} + +void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags, + bh_end_io_t *end_io) +{ + BUG_ON(!buffer_locked(bh)); + + if (ext4_buffer_uptodate(bh)) { + unlock_buffer(bh); + return; + } + __ext4_read_bh(bh, op_flags, end_io); +} + +int ext4_read_bh(struct buffer_head *bh, int op_flags, bh_end_io_t *end_io) +{ + BUG_ON(!buffer_locked(bh)); + + if (ext4_buffer_uptodate(bh)) { + unlock_buffer(bh); + return 0; + } + + __ext4_read_bh(bh, op_flags, end_io); + + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return 0; + return -EIO; +} + +int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait) +{ + if (trylock_buffer(bh)) { + if (wait) + return ext4_read_bh(bh, op_flags, NULL); + ext4_read_bh_nowait(bh, op_flags, NULL); + return 0; + } + if (wait) { + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return 0; + return -EIO; + } + return 0; +} + /* - * This works like sb_bread() except it uses ERR_PTR for error + * This works like __bread_gfp() except it uses ERR_PTR for error * returns. Currently with sb_bread it's impossible to distinguish * between ENOMEM and EIO situations (since both result in a NULL * return. */ -struct buffer_head * -ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags) +static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, + sector_t block, int op_flags, + gfp_t gfp) { - struct buffer_head *bh = sb_getblk(sb, block); + struct buffer_head *bh; + int ret; + bh = sb_getblk_gfp(sb, block, gfp); if (bh == NULL) return ERR_PTR(-ENOMEM); if (ext4_buffer_uptodate(bh)) return bh; - ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh); - wait_on_buffer(bh); - if (buffer_uptodate(bh)) - return bh; - put_bh(bh); - return ERR_PTR(-EIO); + + ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true); + if (ret) { + put_bh(bh); + return ERR_PTR(ret); + } + return bh; +} + +struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, + int op_flags) +{ + return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE); +} + +struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, + sector_t block) +{ + return __ext4_sb_bread_gfp(sb, block, 0, 0); +} + +void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) +{ + struct buffer_head *bh = sb_getblk_gfp(sb, block, 0); + + if (likely(bh)) { + ext4_read_bh_lock(bh, REQ_RAHEAD, false); + brelse(bh); + } } static int ext4_verify_csum_type(struct super_block *sb, @@ -201,7 +289,18 @@ void ext4_superblock_csum_set(struct super_block *sb) if (!ext4_has_metadata_csum(sb)) return; + /* + * Locking the superblock prevents the scenario + * where: + * 1) a first thread pauses during checksum calculation. + * 2) a second thread updates the superblock, recalculates + * the checksum, and updates s_checksum + * 3) the first thread resumes and finishes its checksum calculation + * and updates s_checksum with a potentially stale or torn value. + */ + lock_buffer(EXT4_SB(sb)->s_sbh); es->s_checksum = ext4_superblock_csum(sb, es); + unlock_buffer(EXT4_SB(sb)->s_sbh); } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, @@ -472,6 +571,89 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) spin_unlock(&sbi->s_md_lock); } +/* + * This writepage callback for write_cache_pages() + * takes care of a few cases after page cleaning. + * + * write_cache_pages() already checks for dirty pages + * and calls clear_page_dirty_for_io(), which we want, + * to write protect the pages. + * + * However, we may have to redirty a page (see below.) + */ +static int ext4_journalled_writepage_callback(struct page *page, + struct writeback_control *wbc, + void *data) +{ + transaction_t *transaction = (transaction_t *) data; + struct buffer_head *bh, *head; + struct journal_head *jh; + + bh = head = page_buffers(page); + do { + /* + * We have to redirty a page in these cases: + * 1) If buffer is dirty, it means the page was dirty because it + * contains a buffer that needs checkpointing. So the dirty bit + * needs to be preserved so that checkpointing writes the buffer + * properly. + * 2) If buffer is not part of the committing transaction + * (we may have just accidentally come across this buffer because + * inode range tracking is not exact) or if the currently running + * transaction already contains this buffer as well, dirty bit + * needs to be preserved so that the buffer gets writeprotected + * properly on running transaction's commit. + */ + jh = bh2jh(bh); + if (buffer_dirty(bh) || + (jh && (jh->b_transaction != transaction || + jh->b_next_transaction))) { + redirty_page_for_writepage(wbc, page); + goto out; + } + } while ((bh = bh->b_this_page) != head); + +out: + return AOP_WRITEPAGE_ACTIVATE; +} + +static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) +{ + struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_start = jinode->i_dirty_start, + .range_end = jinode->i_dirty_end, + }; + + return write_cache_pages(mapping, &wbc, + ext4_journalled_writepage_callback, + jinode->i_transaction); +} + +static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) +{ + int ret; + + if (ext4_should_journal_data(jinode->i_vfs_inode)) + ret = ext4_journalled_submit_inode_data_buffers(jinode); + else + ret = jbd2_journal_submit_inode_data_buffers(jinode); + + return ret; +} + +static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) +{ + int ret = 0; + + if (!ext4_should_journal_data(jinode->i_vfs_inode)) + ret = jbd2_journal_finish_inode_data_buffers(jinode); + + return ret; +} + static bool system_going_down(void) { return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF @@ -939,10 +1121,10 @@ static void ext4_blkdev_put(struct block_device *bdev) static void ext4_blkdev_remove(struct ext4_sb_info *sbi) { struct block_device *bdev; - bdev = sbi->journal_bdev; + bdev = sbi->s_journal_bdev; if (bdev) { ext4_blkdev_put(bdev); - sbi->journal_bdev = NULL; + sbi->s_journal_bdev = NULL; } } @@ -1073,14 +1255,14 @@ static void ext4_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); - if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { + if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) { /* * Invalidate the journal device's buffers. We don't want them * floating about in memory - the physical journal device may * hotswapped, and it breaks the `ro-after' testing code. */ - sync_blockdev(sbi->journal_bdev); - invalidate_bdev(sbi->journal_bdev); + sync_blockdev(sbi->s_journal_bdev); + invalidate_bdev(sbi->s_journal_bdev); ext4_blkdev_remove(sbi); } @@ -1149,6 +1331,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_datasync_tid = 0; atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); + ext4_fc_init_inode(&ei->vfs_inode); + mutex_init(&ei->i_fc_lock); return &ei->vfs_inode; } @@ -1166,6 +1350,10 @@ static int ext4_drop_inode(struct inode *inode) static void ext4_free_in_core_inode(struct inode *inode) { fscrypt_free_inode(inode); + if (!list_empty(&(EXT4_I(inode)->i_fc_list))) { + pr_warn("%s: inode %ld still in fc list", + __func__, inode->i_ino); + } kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } @@ -1191,6 +1379,7 @@ static void init_once(void *foo) init_rwsem(&ei->i_data_sem); init_rwsem(&ei->i_mmap_sem); inode_init_once(&ei->vfs_inode); + ext4_fc_init_inode(&ei->vfs_inode); } static int __init init_inodecache(void) @@ -1219,6 +1408,7 @@ static void destroy_inodecache(void) void ext4_clear_inode(struct inode *inode) { + ext4_fc_del(inode); invalidate_inode_buffers(inode); clear_inode(inode); ext4_discard_preallocations(inode, 0); @@ -1526,7 +1716,11 @@ enum { Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, - Opt_prefetch_block_bitmaps, + Opt_prefetch_block_bitmaps, Opt_no_fc, +#ifdef CONFIG_EXT4_DEBUG + Opt_fc_debug_max_replay, +#endif + Opt_fc_debug_force }; static const match_table_t tokens = { @@ -1613,6 +1807,11 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable=%u"}, {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, + {Opt_no_fc, "no_fc"}, + {Opt_fc_debug_force, "fc_debug_force"}, +#ifdef CONFIG_EXT4_DEBUG + {Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"}, +#endif {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, @@ -1739,6 +1938,7 @@ static int clear_qf_name(struct super_block *sb, int qtype) #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) #define MOPT_STRING 0x0400 #define MOPT_SKIP 0x0800 +#define MOPT_2 0x1000 static const struct mount_opts { int token; @@ -1839,6 +2039,13 @@ static const struct mount_opts { {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, + {Opt_no_fc, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, + MOPT_CLEAR | MOPT_2 | MOPT_EXT4_ONLY}, + {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, + MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, +#ifdef CONFIG_EXT4_DEBUG + {Opt_fc_debug_max_replay, 0, MOPT_GTE0}, +#endif {Opt_err, 0, 0} }; @@ -2048,6 +2255,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, sbi->s_li_wait_mult = arg; } else if (token == Opt_max_dir_size_kb) { sbi->s_max_dir_size_kb = arg; +#ifdef CONFIG_EXT4_DEBUG + } else if (token == Opt_fc_debug_max_replay) { + sbi->s_fc_debug_max_replay = arg; +#endif } else if (token == Opt_stripe) { sbi->s_stripe = arg; } else if (token == Opt_resuid) { @@ -2216,10 +2427,17 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, WARN_ON(1); return -1; } - if (arg != 0) - sbi->s_mount_opt |= m->mount_opt; - else - sbi->s_mount_opt &= ~m->mount_opt; + if (m->flags & MOPT_2) { + if (arg != 0) + sbi->s_mount_opt2 |= m->mount_opt; + else + sbi->s_mount_opt2 &= ~m->mount_opt; + } else { + if (arg != 0) + sbi->s_mount_opt |= m->mount_opt; + else + sbi->s_mount_opt &= ~m->mount_opt; + } } return 1; } @@ -2436,6 +2654,9 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PUTS("dax=inode"); } + if (test_opt2(sb, JOURNAL_FAST_COMMIT)) + SEQ_OPTS_PUTS("fast_commit"); + ext4_show_quota_options(seq, sb); return 0; } @@ -3754,7 +3975,7 @@ int ext4_calculate_overhead(struct super_block *sb) * Add the internal journal blocks whether the journal has been * loaded or not */ - if (sbi->s_journal && !sbi->journal_bdev) + if (sbi->s_journal && !sbi->s_journal_bdev) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ @@ -3868,8 +4089,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) logical_sb_block = sb_block; } - if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) { + bh = ext4_sb_bread_unmovable(sb, logical_sb_block); + if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "unable to read superblock"); + ret = PTR_ERR(bh); + bh = NULL; goto out_fail; } /* @@ -3936,6 +4160,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif + if (ext4_has_feature_fast_commit(sb)) + set_opt2(sb, JOURNAL_FAST_COMMIT); /* don't forget to enable journal_csum when metadata_csum is enabled. */ if (ext4_has_metadata_csum(sb)) set_opt(sb, JOURNAL_CHECKSUM); @@ -4265,10 +4491,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); - bh = sb_bread_unmovable(sb, logical_sb_block); - if (!bh) { + bh = ext4_sb_bread_unmovable(sb, logical_sb_block); + if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try"); + ret = PTR_ERR(bh); + bh = NULL; goto failed_mount; } es = (struct ext4_super_block *)(bh->b_data + offset); @@ -4480,18 +4708,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Pre-read the descriptors into the buffer cache */ for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); - sb_breadahead_unmovable(sb, block); + ext4_sb_breadahead_unmovable(sb, block); } for (i = 0; i < db_count; i++) { struct buffer_head *bh; block = descriptor_loc(sb, logical_sb_block, i); - bh = sb_bread_unmovable(sb, block); - if (!bh) { + bh = ext4_sb_bread_unmovable(sb, block); + if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "can't read group descriptor %d", i); db_count = i; + ret = PTR_ERR(bh); + bh = NULL; goto failed_mount2; } rcu_read_lock(); @@ -4539,6 +4769,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); + /* Initialize fast commit stuff */ + atomic_set(&sbi->s_fc_subtid, 0); + atomic_set(&sbi->s_fc_ineligible_updates, 0); + INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); + INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); + INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); + INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); + sbi->s_fc_bytes = 0; + sbi->s_mount_state &= ~EXT4_FC_INELIGIBLE; + sbi->s_mount_state &= ~EXT4_FC_COMMITTING; + spin_lock_init(&sbi->s_fc_lock); + memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); + sbi->s_fc_replay_state.fc_regions = NULL; + sbi->s_fc_replay_state.fc_regions_size = 0; + sbi->s_fc_replay_state.fc_regions_used = 0; + sbi->s_fc_replay_state.fc_regions_valid = 0; + sbi->s_fc_replay_state.fc_modified_inodes = NULL; + sbi->s_fc_replay_state.fc_modified_inodes_size = 0; + sbi->s_fc_replay_state.fc_modified_inodes_used = 0; + sb->s_root = NULL; needs_recovery = (es->s_last_orphan != 0 || @@ -4588,6 +4838,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); + clear_opt2(sb, JOURNAL_FAST_COMMIT); sbi->s_journal = NULL; needs_recovery = 0; goto no_journal; @@ -4646,6 +4897,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; + sbi->s_journal->j_submit_inode_data_buffers = + ext4_journal_submit_inode_data_buffers; + sbi->s_journal->j_finish_inode_data_buffers = + ext4_journal_finish_inode_data_buffers; no_journal: if (!test_opt(sb, NO_MBCACHE)) { @@ -4748,6 +5003,7 @@ no_journal: goto failed_mount4a; } } + ext4_fc_replay_cleanup(sb); ext4_ext_init(sb); err = ext4_mb_init(sb); @@ -4814,9 +5070,8 @@ no_journal: * used to detect the metadata async write error. */ spin_lock_init(&sbi->s_bdev_wb_lock); - if (!sb_rdonly(sb)) - errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, - &sbi->s_bdev_wb_err); + errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, + &sbi->s_bdev_wb_err); sb->s_bdev->bd_super = sb; EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); @@ -4872,6 +5127,7 @@ cantfind_ext4: failed_mount8: ext4_unregister_sysfs(sb); + kobject_put(&sbi->s_kobj); failed_mount7: ext4_unregister_li_request(sb); failed_mount6: @@ -4960,6 +5216,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_commit_interval = sbi->s_commit_interval; journal->j_min_batch_time = sbi->s_min_batch_time; journal->j_max_batch_time = sbi->s_max_batch_time; + ext4_fc_init(sb, journal); write_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) @@ -5102,9 +5359,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, goto out_bdev; } journal->j_private = sb; - ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); - wait_on_buffer(journal->j_sb_buffer); - if (!buffer_uptodate(journal->j_sb_buffer)) { + if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) { ext4_msg(sb, KERN_ERR, "I/O error on journal device"); goto out_journal; } @@ -5114,7 +5369,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, be32_to_cpu(journal->j_superblock->s_nr_users)); goto out_journal; } - EXT4_SB(sb)->journal_bdev = bdev; + EXT4_SB(sb)->s_journal_bdev = bdev; ext4_init_journal_params(sb, journal); return journal; @@ -5708,14 +5963,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } /* - * Update the original bdev mapping's wb_err value - * which could be used to detect the metadata async - * write error. - */ - errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, - &sbi->s_bdev_wb_err); - - /* * Mounting a RDONLY partition read-write, so reread * and store the current valid flag. (It may have * been changed by e2fsck since we originally mounted @@ -5760,7 +6007,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * Releasing of existing data is done when we are sure remount will * succeed. */ - if (test_opt(sb, BLOCK_VALIDITY) && !sbi->system_blks) { + if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) { err = ext4_setup_system_zone(sb); if (err) goto restore_opts; @@ -5786,7 +6033,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } } #endif - if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks) + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); /* @@ -5809,7 +6056,7 @@ restore_opts: sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; - if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks) + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; @@ -5897,8 +6144,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = EXT4_NAME_LEN; fsid = le64_to_cpup((void *)es->s_uuid) ^ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); - buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + buf->f_fsid = u64_to_fsid(fsid); #ifdef CONFIG_QUOTA if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && @@ -6042,6 +6288,11 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, /* Quotafile not on the same filesystem? */ if (path->dentry->d_sb != sb) return -EXDEV; + + /* Quota already enabled for this file? */ + if (IS_NOQUOTA(d_inode(path->dentry))) + return -EBUSY; + /* Journaling quota? */ if (EXT4_SB(sb)->s_qf_names[type]) { /* Quotafile not in fs root? */ @@ -6309,6 +6560,10 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, brelse(bh); out: if (inode->i_size < off + len) { + ext4_fc_track_range(inode, + (inode->i_size > 0 ? inode->i_size - 1 : 0) + >> inode->i_sb->s_blocksize_bits, + (off + len) >> inode->i_sb->s_blocksize_bits); i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; err2 = ext4_mark_inode_dirty(handle, inode); @@ -6437,6 +6692,11 @@ static int __init ext4_init_fs(void) err = init_inodecache(); if (err) goto out1; + + err = ext4_fc_init_dentry_cache(); + if (err) + goto out05; + register_as_ext3(); register_as_ext2(); err = register_filesystem(&ext4_fs_type); @@ -6447,6 +6707,7 @@ static int __init ext4_init_fs(void) out: unregister_as_ext2(); unregister_as_ext3(); +out05: destroy_inodecache(); out1: ext4_exit_mballoc(); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index bfabb799fa45..5ff33d18996a 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -521,6 +521,8 @@ int ext4_register_sysfs(struct super_block *sb) proc_create_single_data("es_shrinker_info", S_IRUGO, sbi->s_proc, ext4_seq_es_shrinker_info_show, sb); + proc_create_single_data("fc_info", 0444, sbi->s_proc, + ext4_fc_info_show, sb); proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_ops, sb); } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index cba4b877c606..6127e94ea4f5 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2419,6 +2419,7 @@ retry_inode: if (IS_SYNC(inode)) ext4_handle_sync(handle); } + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); cleanup: brelse(is.iloc.bh); @@ -2496,6 +2497,7 @@ retry: if (error == 0) error = error2; } + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); return error; } @@ -2928,6 +2930,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, error); goto cleanup; } + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); } error = 0; cleanup: diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0c958fed3392..00eff2f51807 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1442,8 +1442,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) } buf->f_namelen = F2FS_NAME_LEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); #ifdef CONFIG_QUOTA if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a0cf99debb1e..bab9b202b496 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -836,8 +836,7 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = sbi->max_cluster - FAT_START_ENT; buf->f_bfree = sbi->free_clusters; buf->f_bavail = sbi->free_clusters; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = (sbi->options.isvfat ? FAT_LFN_LEN : 12) * NLS_MAX_CHARSET_SIZE; diff --git a/fs/file_table.c b/fs/file_table.c index 656647f9575a..709ada3151da 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -339,7 +339,7 @@ void fput_many(struct file *file, unsigned int refs) if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { init_task_work(&file->f_u.fu_rcuhead, ____fput); - if (!task_work_add(task, &file->f_u.fu_rcuhead, true)) + if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME)) return; /* * After this task has run exit_task_work(), diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 774b2618018a..40ce9a1c12e5 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -8,7 +8,7 @@ config FUSE_FS There's also a companion library: libfuse2. This library is available from the FUSE homepage: - <http://fuse.sourceforge.net/> + <https://github.com/libfuse/> although chances are your distribution already has that library installed if you've installed the "fuse" package itself. @@ -38,3 +38,17 @@ config VIRTIO_FS If you want to share files between guests or with the host, answer Y or M. + +config FUSE_DAX + bool "Virtio Filesystem Direct Host Memory Access support" + default y + select INTERVAL_TREE + depends on VIRTIO_FS + depends on FS_DAX + depends on DAX_DRIVER + help + This allows bypassing guest page cache and allows mapping host page + cache directly in guest address space. + + If you want to allow mounting a Virtio Filesystem with the "dax" + option, answer Y. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 3e8cebfb59b7..8c7021fb2cd4 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -7,5 +7,7 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o -fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o -virtiofs-y += virtio_fs.o +fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o +fuse-$(CONFIG_FUSE_DAX) += dax.o + +virtiofs-y := virtio_fs.o diff --git a/fs/fuse/control.c b/fs/fuse/control.c index a1303ad303ba..cc7e94d73c6c 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -164,6 +164,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, { unsigned val; struct fuse_conn *fc; + struct fuse_mount *fm; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, @@ -174,18 +175,27 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, if (!fc) goto out; + down_read(&fc->killsb); spin_lock(&fc->bg_lock); fc->congestion_threshold = val; - if (fc->sb) { + + /* + * Get any fuse_mount belonging to this fuse_conn; s_bdi is + * shared between all of them + */ + + if (!list_empty(&fc->mounts)) { + fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry); if (fc->num_background < fc->congestion_threshold) { - clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); + clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); } else { - set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); + set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); } } spin_unlock(&fc->bg_lock); + up_read(&fc->killsb); fuse_conn_put(fc); out: return ret; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 2cc17816d7b1..45082269e698 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -57,6 +57,7 @@ struct cuse_conn { struct list_head list; /* linked on cuse_conntbl */ + struct fuse_mount fm; /* Dummy mount referencing fc */ struct fuse_conn fc; /* fuse connection */ struct cdev *cdev; /* associated character device */ struct device *dev; /* device representing @cdev */ @@ -134,7 +135,7 @@ static int cuse_open(struct inode *inode, struct file *file) * Generic permission check is already done against the chrdev * file, proceed to open. */ - rc = fuse_do_open(&cc->fc, 0, file, 0); + rc = fuse_do_open(&cc->fm, 0, file, 0); if (rc) fuse_conn_put(&cc->fc); return rc; @@ -143,10 +144,10 @@ static int cuse_open(struct inode *inode, struct file *file) static int cuse_release(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; fuse_sync_release(NULL, ff, file->f_flags); - fuse_conn_put(fc); + fuse_conn_put(fm->fc); return 0; } @@ -155,7 +156,7 @@ static long cuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fuse_file *ff = file->private_data; - struct cuse_conn *cc = fc_to_cc(ff->fc); + struct cuse_conn *cc = fc_to_cc(ff->fm->fc); unsigned int flags = 0; if (cc->unrestricted_ioctl) @@ -168,7 +169,7 @@ static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fuse_file *ff = file->private_data; - struct cuse_conn *cc = fc_to_cc(ff->fc); + struct cuse_conn *cc = fc_to_cc(ff->fm->fc); unsigned int flags = FUSE_IOCTL_COMPAT; if (cc->unrestricted_ioctl) @@ -313,9 +314,10 @@ struct cuse_init_args { * required data structures for it. Please read the comment at the * top of this file for high level overview. */ -static void cuse_process_init_reply(struct fuse_conn *fc, +static void cuse_process_init_reply(struct fuse_mount *fm, struct fuse_args *args, int error) { + struct fuse_conn *fc = fm->fc; struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args); struct fuse_args_pages *ap = &ia->ap; struct cuse_conn *cc = fc_to_cc(fc), *pos; @@ -424,7 +426,7 @@ static int cuse_send_init(struct cuse_conn *cc) { int rc; struct page *page; - struct fuse_conn *fc = &cc->fc; + struct fuse_mount *fm = &cc->fm; struct cuse_init_args *ia; struct fuse_args_pages *ap; @@ -460,7 +462,7 @@ static int cuse_send_init(struct cuse_conn *cc) ia->desc.length = ap->args.out_args[1].size; ap->args.end = cuse_process_init_reply; - rc = fuse_simple_background(fc, &ap->args, GFP_KERNEL); + rc = fuse_simple_background(fm, &ap->args, GFP_KERNEL); if (rc) { kfree(ia); err_free_page: @@ -506,7 +508,8 @@ static int cuse_channel_open(struct inode *inode, struct file *file) * Limit the cuse channel to requests that can * be represented in file->f_cred->user_ns. */ - fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL); + fuse_conn_init(&cc->fc, &cc->fm, file->f_cred->user_ns, + &fuse_dev_fiq_ops, NULL); fud = fuse_dev_alloc_install(&cc->fc); if (!fud) { diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c new file mode 100644 index 000000000000..ff99ab2a3c43 --- /dev/null +++ b/fs/fuse/dax.c @@ -0,0 +1,1365 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * dax: direct host memory access + * Copyright (C) 2020 Red Hat, Inc. + */ + +#include "fuse_i.h" + +#include <linux/delay.h> +#include <linux/dax.h> +#include <linux/uio.h> +#include <linux/pfn_t.h> +#include <linux/iomap.h> +#include <linux/interval_tree.h> + +/* + * Default memory range size. A power of 2 so it agrees with common FUSE_INIT + * map_alignment values 4KB and 64KB. + */ +#define FUSE_DAX_SHIFT 21 +#define FUSE_DAX_SZ (1 << FUSE_DAX_SHIFT) +#define FUSE_DAX_PAGES (FUSE_DAX_SZ / PAGE_SIZE) + +/* Number of ranges reclaimer will try to free in one invocation */ +#define FUSE_DAX_RECLAIM_CHUNK (10) + +/* + * Dax memory reclaim threshold in percetage of total ranges. When free + * number of free ranges drops below this threshold, reclaim can trigger + * Default is 20% + */ +#define FUSE_DAX_RECLAIM_THRESHOLD (20) + +/** Translation information for file offsets to DAX window offsets */ +struct fuse_dax_mapping { + /* Pointer to inode where this memory range is mapped */ + struct inode *inode; + + /* Will connect in fcd->free_ranges to keep track of free memory */ + struct list_head list; + + /* For interval tree in file/inode */ + struct interval_tree_node itn; + + /* Will connect in fc->busy_ranges to keep track busy memory */ + struct list_head busy_list; + + /** Position in DAX window */ + u64 window_offset; + + /** Length of mapping, in bytes */ + loff_t length; + + /* Is this mapping read-only or read-write */ + bool writable; + + /* reference count when the mapping is used by dax iomap. */ + refcount_t refcnt; +}; + +/* Per-inode dax map */ +struct fuse_inode_dax { + /* Semaphore to protect modifications to the dmap tree */ + struct rw_semaphore sem; + + /* Sorted rb tree of struct fuse_dax_mapping elements */ + struct rb_root_cached tree; + unsigned long nr; +}; + +struct fuse_conn_dax { + /* DAX device */ + struct dax_device *dev; + + /* Lock protecting accessess to members of this structure */ + spinlock_t lock; + + /* List of memory ranges which are busy */ + unsigned long nr_busy_ranges; + struct list_head busy_ranges; + + /* Worker to free up memory ranges */ + struct delayed_work free_work; + + /* Wait queue for a dax range to become free */ + wait_queue_head_t range_waitq; + + /* DAX Window Free Ranges */ + long nr_free_ranges; + struct list_head free_ranges; + + unsigned long nr_ranges; +}; + +static inline struct fuse_dax_mapping * +node_to_dmap(struct interval_tree_node *node) +{ + if (!node) + return NULL; + + return container_of(node, struct fuse_dax_mapping, itn); +} + +static struct fuse_dax_mapping * +alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode); + +static void +__kick_dmap_free_worker(struct fuse_conn_dax *fcd, unsigned long delay_ms) +{ + unsigned long free_threshold; + + /* If number of free ranges are below threshold, start reclaim */ + free_threshold = max_t(unsigned long, fcd->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD / 100, + 1); + if (fcd->nr_free_ranges < free_threshold) + queue_delayed_work(system_long_wq, &fcd->free_work, + msecs_to_jiffies(delay_ms)); +} + +static void kick_dmap_free_worker(struct fuse_conn_dax *fcd, + unsigned long delay_ms) +{ + spin_lock(&fcd->lock); + __kick_dmap_free_worker(fcd, delay_ms); + spin_unlock(&fcd->lock); +} + +static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd) +{ + struct fuse_dax_mapping *dmap; + + spin_lock(&fcd->lock); + dmap = list_first_entry_or_null(&fcd->free_ranges, + struct fuse_dax_mapping, list); + if (dmap) { + list_del_init(&dmap->list); + WARN_ON(fcd->nr_free_ranges <= 0); + fcd->nr_free_ranges--; + } + spin_unlock(&fcd->lock); + + kick_dmap_free_worker(fcd, 0); + return dmap; +} + +/* This assumes fcd->lock is held */ +static void __dmap_remove_busy_list(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + list_del_init(&dmap->busy_list); + WARN_ON(fcd->nr_busy_ranges == 0); + fcd->nr_busy_ranges--; +} + +static void dmap_remove_busy_list(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + spin_lock(&fcd->lock); + __dmap_remove_busy_list(fcd, dmap); + spin_unlock(&fcd->lock); +} + +/* This assumes fcd->lock is held */ +static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + list_add_tail(&dmap->list, &fcd->free_ranges); + fcd->nr_free_ranges++; + wake_up(&fcd->range_waitq); +} + +static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + /* Return fuse_dax_mapping to free list */ + spin_lock(&fcd->lock); + __dmap_add_to_free_pool(fcd, dmap); + spin_unlock(&fcd->lock); +} + +static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx, + struct fuse_dax_mapping *dmap, bool writable, + bool upgrade) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn_dax *fcd = fm->fc->dax; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_setupmapping_in inarg; + loff_t offset = start_idx << FUSE_DAX_SHIFT; + FUSE_ARGS(args); + ssize_t err; + + WARN_ON(fcd->nr_free_ranges < 0); + + /* Ask fuse daemon to setup mapping */ + memset(&inarg, 0, sizeof(inarg)); + inarg.foffset = offset; + inarg.fh = -1; + inarg.moffset = dmap->window_offset; + inarg.len = FUSE_DAX_SZ; + inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ; + if (writable) + inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE; + args.opcode = FUSE_SETUPMAPPING; + args.nodeid = fi->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + err = fuse_simple_request(fm, &args); + if (err < 0) + return err; + dmap->writable = writable; + if (!upgrade) { + /* + * We don't take a refernce on inode. inode is valid right now + * and when inode is going away, cleanup logic should first + * cleanup dmap entries. + */ + dmap->inode = inode; + dmap->itn.start = dmap->itn.last = start_idx; + /* Protected by fi->dax->sem */ + interval_tree_insert(&dmap->itn, &fi->dax->tree); + fi->dax->nr++; + spin_lock(&fcd->lock); + list_add_tail(&dmap->busy_list, &fcd->busy_ranges); + fcd->nr_busy_ranges++; + spin_unlock(&fcd->lock); + } + return 0; +} + +static int fuse_send_removemapping(struct inode *inode, + struct fuse_removemapping_in *inargp, + struct fuse_removemapping_one *remove_one) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_mount *fm = get_fuse_mount(inode); + FUSE_ARGS(args); + + args.opcode = FUSE_REMOVEMAPPING; + args.nodeid = fi->nodeid; + args.in_numargs = 2; + args.in_args[0].size = sizeof(*inargp); + args.in_args[0].value = inargp; + args.in_args[1].size = inargp->count * sizeof(*remove_one); + args.in_args[1].value = remove_one; + return fuse_simple_request(fm, &args); +} + +static int dmap_removemapping_list(struct inode *inode, unsigned int num, + struct list_head *to_remove) +{ + struct fuse_removemapping_one *remove_one, *ptr; + struct fuse_removemapping_in inarg; + struct fuse_dax_mapping *dmap; + int ret, i = 0, nr_alloc; + + nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY); + remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS); + if (!remove_one) + return -ENOMEM; + + ptr = remove_one; + list_for_each_entry(dmap, to_remove, list) { + ptr->moffset = dmap->window_offset; + ptr->len = dmap->length; + ptr++; + i++; + num--; + if (i >= nr_alloc || num == 0) { + memset(&inarg, 0, sizeof(inarg)); + inarg.count = i; + ret = fuse_send_removemapping(inode, &inarg, + remove_one); + if (ret) + goto out; + ptr = remove_one; + i = 0; + } + } +out: + kfree(remove_one); + return ret; +} + +/* + * Cleanup dmap entry and add back to free list. This should be called with + * fcd->lock held. + */ +static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n", + dmap->itn.start, dmap->itn.last, dmap->window_offset, + dmap->length); + __dmap_remove_busy_list(fcd, dmap); + dmap->inode = NULL; + dmap->itn.start = dmap->itn.last = 0; + __dmap_add_to_free_pool(fcd, dmap); +} + +/* + * Free inode dmap entries whose range falls inside [start, end]. + * Does not take any locks. At this point of time it should only be + * called from evict_inode() path where we know all dmap entries can be + * reclaimed. + */ +static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd, + struct inode *inode, + loff_t start, loff_t end) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap, *n; + int err, num = 0; + LIST_HEAD(to_remove); + unsigned long start_idx = start >> FUSE_DAX_SHIFT; + unsigned long end_idx = end >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + while (1) { + node = interval_tree_iter_first(&fi->dax->tree, start_idx, + end_idx); + if (!node) + break; + dmap = node_to_dmap(node); + /* inode is going away. There should not be any users of dmap */ + WARN_ON(refcount_read(&dmap->refcnt) > 1); + interval_tree_remove(&dmap->itn, &fi->dax->tree); + num++; + list_add(&dmap->list, &to_remove); + } + + /* Nothing to remove */ + if (list_empty(&to_remove)) + return; + + WARN_ON(fi->dax->nr < num); + fi->dax->nr -= num; + err = dmap_removemapping_list(inode, num, &to_remove); + if (err && err != -ENOTCONN) { + pr_warn("Failed to removemappings. start=0x%llx end=0x%llx\n", + start, end); + } + spin_lock(&fcd->lock); + list_for_each_entry_safe(dmap, n, &to_remove, list) { + list_del_init(&dmap->list); + dmap_reinit_add_to_free_pool(fcd, dmap); + } + spin_unlock(&fcd->lock); +} + +static int dmap_removemapping_one(struct inode *inode, + struct fuse_dax_mapping *dmap) +{ + struct fuse_removemapping_one forget_one; + struct fuse_removemapping_in inarg; + + memset(&inarg, 0, sizeof(inarg)); + inarg.count = 1; + memset(&forget_one, 0, sizeof(forget_one)); + forget_one.moffset = dmap->window_offset; + forget_one.len = dmap->length; + + return fuse_send_removemapping(inode, &inarg, &forget_one); +} + +/* + * It is called from evict_inode() and by that time inode is going away. So + * this function does not take any locks like fi->dax->sem for traversing + * that fuse inode interval tree. If that lock is taken then lock validator + * complains of deadlock situation w.r.t fs_reclaim lock. + */ +void fuse_dax_inode_cleanup(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + /* + * fuse_evict_inode() has already called truncate_inode_pages_final() + * before we arrive here. So we should not have to worry about any + * pages/exception entries still associated with inode. + */ + inode_reclaim_dmap_range(fc->dax, inode, 0, -1); + WARN_ON(fi->dax->nr); +} + +static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length) +{ + iomap->addr = IOMAP_NULL_ADDR; + iomap->length = length; + iomap->type = IOMAP_HOLE; +} + +static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap, struct fuse_dax_mapping *dmap, + unsigned int flags) +{ + loff_t offset, len; + loff_t i_size = i_size_read(inode); + + offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT); + len = min(length, dmap->length - offset); + + /* If length is beyond end of file, truncate further */ + if (pos + len > i_size) + len = i_size - pos; + + if (len > 0) { + iomap->addr = dmap->window_offset + offset; + iomap->length = len; + if (flags & IOMAP_FAULT) + iomap->length = ALIGN(len, PAGE_SIZE); + iomap->type = IOMAP_MAPPED; + /* + * increace refcnt so that reclaim code knows this dmap is in + * use. This assumes fi->dax->sem mutex is held either + * shared/exclusive. + */ + refcount_inc(&dmap->refcnt); + + /* iomap->private should be NULL */ + WARN_ON_ONCE(iomap->private); + iomap->private = dmap; + } else { + /* Mapping beyond end of file is hole */ + fuse_fill_iomap_hole(iomap, length); + } +} + +static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos, + loff_t length, unsigned int flags, + struct iomap *iomap) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_conn_dax *fcd = fc->dax; + struct fuse_dax_mapping *dmap, *alloc_dmap = NULL; + int ret; + bool writable = flags & IOMAP_WRITE; + unsigned long start_idx = pos >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + /* + * Can't do inline reclaim in fault path. We call + * dax_layout_busy_page() before we free a range. And + * fuse_wait_dax_page() drops fi->i_mmap_sem lock and requires it. + * In fault path we enter with fi->i_mmap_sem held and can't drop + * it. Also in fault path we hold fi->i_mmap_sem shared and not + * exclusive, so that creates further issues with fuse_wait_dax_page(). + * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory + * range to become free and retry. + */ + if (flags & IOMAP_FAULT) { + alloc_dmap = alloc_dax_mapping(fcd); + if (!alloc_dmap) + return -EAGAIN; + } else { + alloc_dmap = alloc_dax_mapping_reclaim(fcd, inode); + if (IS_ERR(alloc_dmap)) + return PTR_ERR(alloc_dmap); + } + + /* If we are here, we should have memory allocated */ + if (WARN_ON(!alloc_dmap)) + return -EIO; + + /* + * Take write lock so that only one caller can try to setup mapping + * and other waits. + */ + down_write(&fi->dax->sem); + /* + * We dropped lock. Check again if somebody else setup + * mapping already. + */ + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + if (node) { + dmap = node_to_dmap(node); + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); + dmap_add_to_free_pool(fcd, alloc_dmap); + up_write(&fi->dax->sem); + return 0; + } + + /* Setup one mapping */ + ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap, + writable, false); + if (ret < 0) { + dmap_add_to_free_pool(fcd, alloc_dmap); + up_write(&fi->dax->sem); + return ret; + } + fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags); + up_write(&fi->dax->sem); + return 0; +} + +static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos, + loff_t length, unsigned int flags, + struct iomap *iomap) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + int ret; + unsigned long idx = pos >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + /* + * Take exclusive lock so that only one caller can try to setup + * mapping and others wait. + */ + down_write(&fi->dax->sem); + node = interval_tree_iter_first(&fi->dax->tree, idx, idx); + + /* We are holding either inode lock or i_mmap_sem, and that should + * ensure that dmap can't be truncated. We are holding a reference + * on dmap and that should make sure it can't be reclaimed. So dmap + * should still be there in tree despite the fact we dropped and + * re-acquired the fi->dax->sem lock. + */ + ret = -EIO; + if (WARN_ON(!node)) + goto out_err; + + dmap = node_to_dmap(node); + + /* We took an extra reference on dmap to make sure its not reclaimd. + * Now we hold fi->dax->sem lock and that reference is not needed + * anymore. Drop it. + */ + if (refcount_dec_and_test(&dmap->refcnt)) { + /* refcount should not hit 0. This object only goes + * away when fuse connection goes away + */ + WARN_ON_ONCE(1); + } + + /* Maybe another thread already upgraded mapping while we were not + * holding lock. + */ + if (dmap->writable) { + ret = 0; + goto out_fill_iomap; + } + + ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true, + true); + if (ret < 0) + goto out_err; +out_fill_iomap: + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); +out_err: + up_write(&fi->dax->sem); + return ret; +} + +/* This is just for DAX and the mapping is ephemeral, do not use it for other + * purposes since there is no block device with a permanent mapping. + */ +static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_dax_mapping *dmap; + bool writable = flags & IOMAP_WRITE; + unsigned long start_idx = pos >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + /* We don't support FIEMAP */ + if (WARN_ON(flags & IOMAP_REPORT)) + return -EIO; + + iomap->offset = pos; + iomap->flags = 0; + iomap->bdev = NULL; + iomap->dax_dev = fc->dax->dev; + + /* + * Both read/write and mmap path can race here. So we need something + * to make sure if we are setting up mapping, then other path waits + * + * For now, use a semaphore for this. It probably needs to be + * optimized later. + */ + down_read(&fi->dax->sem); + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + if (node) { + dmap = node_to_dmap(node); + if (writable && !dmap->writable) { + /* Upgrade read-only mapping to read-write. This will + * require exclusive fi->dax->sem lock as we don't want + * two threads to be trying to this simultaneously + * for same dmap. So drop shared lock and acquire + * exclusive lock. + * + * Before dropping fi->dax->sem lock, take reference + * on dmap so that its not freed by range reclaim. + */ + refcount_inc(&dmap->refcnt); + up_read(&fi->dax->sem); + pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n", + __func__, pos, length); + return fuse_upgrade_dax_mapping(inode, pos, length, + flags, iomap); + } else { + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); + up_read(&fi->dax->sem); + return 0; + } + } else { + up_read(&fi->dax->sem); + pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n", + __func__, pos, length); + if (pos >= i_size_read(inode)) + goto iomap_hole; + + return fuse_setup_new_dax_mapping(inode, pos, length, flags, + iomap); + } + + /* + * If read beyond end of file happnes, fs code seems to return + * it as hole + */ +iomap_hole: + fuse_fill_iomap_hole(iomap, length); + pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n", + __func__, pos, length, iomap->length); + return 0; +} + +static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, + struct iomap *iomap) +{ + struct fuse_dax_mapping *dmap = iomap->private; + + if (dmap) { + if (refcount_dec_and_test(&dmap->refcnt)) { + /* refcount should not hit 0. This object only goes + * away when fuse connection goes away + */ + WARN_ON_ONCE(1); + } + } + + /* DAX writes beyond end-of-file aren't handled using iomap, so the + * file size is unchanged and there is nothing to do here. + */ + return 0; +} + +static const struct iomap_ops fuse_iomap_ops = { + .iomap_begin = fuse_iomap_begin, + .iomap_end = fuse_iomap_end, +}; + +static void fuse_wait_dax_page(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + up_write(&fi->i_mmap_sem); + schedule(); + down_write(&fi->i_mmap_sem); +} + +/* Should be called with fi->i_mmap_sem lock held exclusively */ +static int __fuse_dax_break_layouts(struct inode *inode, bool *retry, + loff_t start, loff_t end) +{ + struct page *page; + + page = dax_layout_busy_page_range(inode->i_mapping, start, end); + if (!page) + return 0; + + *retry = true; + return ___wait_var_event(&page->_refcount, + atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, + 0, 0, fuse_wait_dax_page(inode)); +} + +/* dmap_end == 0 leads to unmapping of whole file */ +int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, + u64 dmap_end) +{ + bool retry; + int ret; + + do { + retry = false; + ret = __fuse_dax_break_layouts(inode, &retry, dmap_start, + dmap_end); + } while (ret == 0 && retry); + + return ret; +} + +ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + + ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops); + inode_unlock_shared(inode); + + /* TODO file_accessed(iocb->f_filp) */ + return ret; +} + +static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return (iov_iter_rw(from) == WRITE && + ((iocb->ki_pos) >= i_size_read(inode) || + (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode)))); +} + +static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + ssize_t ret; + + ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); + if (ret < 0) + return ret; + + fuse_invalidate_attr(inode); + fuse_write_update_size(inode, iocb->ki_pos); + return ret; +} + +ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + + ret = file_remove_privs(iocb->ki_filp); + if (ret) + goto out; + /* TODO file_update_time() but we don't want metadata I/O */ + + /* Do not use dax for file extending writes as write and on + * disk i_size increase are not atomic otherwise. + */ + if (file_extending_write(iocb, from)) + ret = fuse_dax_direct_write(iocb, from); + else + ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops); + +out: + inode_unlock(inode); + + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} + +static int fuse_dax_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + + return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc); +} + +static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, + enum page_entry_size pe_size, bool write) +{ + vm_fault_t ret; + struct inode *inode = file_inode(vmf->vma->vm_file); + struct super_block *sb = inode->i_sb; + pfn_t pfn; + int error = 0; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_conn_dax *fcd = fc->dax; + bool retry = false; + + if (write) + sb_start_pagefault(sb); +retry: + if (retry && !(fcd->nr_free_ranges > 0)) + wait_event(fcd->range_waitq, (fcd->nr_free_ranges > 0)); + + /* + * We need to serialize against not only truncate but also against + * fuse dax memory range reclaim. While a range is being reclaimed, + * we do not want any read/write/mmap to make progress and try + * to populate page cache or access memory we are trying to free. + */ + down_read(&get_fuse_inode(inode)->i_mmap_sem); + ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); + if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) { + error = 0; + retry = true; + up_read(&get_fuse_inode(inode)->i_mmap_sem); + goto retry; + } + + if (ret & VM_FAULT_NEEDDSYNC) + ret = dax_finish_sync_fault(vmf, pe_size, pfn); + up_read(&get_fuse_inode(inode)->i_mmap_sem); + + if (write) + sb_end_pagefault(sb); + + return ret; +} + +static vm_fault_t fuse_dax_fault(struct vm_fault *vmf) +{ + return __fuse_dax_fault(vmf, PE_SIZE_PTE, + vmf->flags & FAULT_FLAG_WRITE); +} + +static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) +{ + return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE); +} + +static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf) +{ + return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); +} + +static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf) +{ + return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); +} + +static const struct vm_operations_struct fuse_dax_vm_ops = { + .fault = fuse_dax_fault, + .huge_fault = fuse_dax_huge_fault, + .page_mkwrite = fuse_dax_page_mkwrite, + .pfn_mkwrite = fuse_dax_pfn_mkwrite, +}; + +int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &fuse_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + return 0; +} + +static int dmap_writeback_invalidate(struct inode *inode, + struct fuse_dax_mapping *dmap) +{ + int ret; + loff_t start_pos = dmap->itn.start << FUSE_DAX_SHIFT; + loff_t end_pos = (start_pos + FUSE_DAX_SZ - 1); + + ret = filemap_fdatawrite_range(inode->i_mapping, start_pos, end_pos); + if (ret) { + pr_debug("fuse: filemap_fdatawrite_range() failed. err=%d start_pos=0x%llx, end_pos=0x%llx\n", + ret, start_pos, end_pos); + return ret; + } + + ret = invalidate_inode_pages2_range(inode->i_mapping, + start_pos >> PAGE_SHIFT, + end_pos >> PAGE_SHIFT); + if (ret) + pr_debug("fuse: invalidate_inode_pages2_range() failed err=%d\n", + ret); + + return ret; +} + +static int reclaim_one_dmap_locked(struct inode *inode, + struct fuse_dax_mapping *dmap) +{ + int ret; + struct fuse_inode *fi = get_fuse_inode(inode); + + /* + * igrab() was done to make sure inode won't go under us, and this + * further avoids the race with evict(). + */ + ret = dmap_writeback_invalidate(inode, dmap); + if (ret) + return ret; + + /* Remove dax mapping from inode interval tree now */ + interval_tree_remove(&dmap->itn, &fi->dax->tree); + fi->dax->nr--; + + /* It is possible that umount/shutdown has killed the fuse connection + * and worker thread is trying to reclaim memory in parallel. Don't + * warn in that case. + */ + ret = dmap_removemapping_one(inode, dmap); + if (ret && ret != -ENOTCONN) { + pr_warn("Failed to remove mapping. offset=0x%llx len=0x%llx ret=%d\n", + dmap->window_offset, dmap->length, ret); + } + return 0; +} + +/* Find first mapped dmap for an inode and return file offset. Caller needs + * to hold fi->dax->sem lock either shared or exclusive. + */ +static struct fuse_dax_mapping *inode_lookup_first_dmap(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + struct interval_tree_node *node; + + for (node = interval_tree_iter_first(&fi->dax->tree, 0, -1); node; + node = interval_tree_iter_next(node, 0, -1)) { + dmap = node_to_dmap(node); + /* still in use. */ + if (refcount_read(&dmap->refcnt) > 1) + continue; + + return dmap; + } + + return NULL; +} + +/* + * Find first mapping in the tree and free it and return it. Do not add + * it back to free pool. + */ +static struct fuse_dax_mapping * +inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode, + bool *retry) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + u64 dmap_start, dmap_end; + unsigned long start_idx; + int ret; + struct interval_tree_node *node; + + down_write(&fi->i_mmap_sem); + + /* Lookup a dmap and corresponding file offset to reclaim. */ + down_read(&fi->dax->sem); + dmap = inode_lookup_first_dmap(inode); + if (dmap) { + start_idx = dmap->itn.start; + dmap_start = start_idx << FUSE_DAX_SHIFT; + dmap_end = dmap_start + FUSE_DAX_SZ - 1; + } + up_read(&fi->dax->sem); + + if (!dmap) + goto out_mmap_sem; + /* + * Make sure there are no references to inode pages using + * get_user_pages() + */ + ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end); + if (ret) { + pr_debug("fuse: fuse_dax_break_layouts() failed. err=%d\n", + ret); + dmap = ERR_PTR(ret); + goto out_mmap_sem; + } + + down_write(&fi->dax->sem); + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + /* Range already got reclaimed by somebody else */ + if (!node) { + if (retry) + *retry = true; + goto out_write_dmap_sem; + } + + dmap = node_to_dmap(node); + /* still in use. */ + if (refcount_read(&dmap->refcnt) > 1) { + dmap = NULL; + if (retry) + *retry = true; + goto out_write_dmap_sem; + } + + ret = reclaim_one_dmap_locked(inode, dmap); + if (ret < 0) { + dmap = ERR_PTR(ret); + goto out_write_dmap_sem; + } + + /* Clean up dmap. Do not add back to free list */ + dmap_remove_busy_list(fcd, dmap); + dmap->inode = NULL; + dmap->itn.start = dmap->itn.last = 0; + + pr_debug("fuse: %s: inline reclaimed memory range. inode=%p, window_offset=0x%llx, length=0x%llx\n", + __func__, inode, dmap->window_offset, dmap->length); + +out_write_dmap_sem: + up_write(&fi->dax->sem); +out_mmap_sem: + up_write(&fi->i_mmap_sem); + return dmap; +} + +static struct fuse_dax_mapping * +alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode) +{ + struct fuse_dax_mapping *dmap; + struct fuse_inode *fi = get_fuse_inode(inode); + + while (1) { + bool retry = false; + + dmap = alloc_dax_mapping(fcd); + if (dmap) + return dmap; + + dmap = inode_inline_reclaim_one_dmap(fcd, inode, &retry); + /* + * Either we got a mapping or it is an error, return in both + * the cases. + */ + if (dmap) + return dmap; + + /* If we could not reclaim a mapping because it + * had a reference or some other temporary failure, + * Try again. We want to give up inline reclaim only + * if there is no range assigned to this node. Otherwise + * if a deadlock is possible if we sleep with fi->i_mmap_sem + * held and worker to free memory can't make progress due + * to unavailability of fi->i_mmap_sem lock. So sleep + * only if fi->dax->nr=0 + */ + if (retry) + continue; + /* + * There are no mappings which can be reclaimed. Wait for one. + * We are not holding fi->dax->sem. So it is possible + * that range gets added now. But as we are not holding + * fi->i_mmap_sem, worker should still be able to free up + * a range and wake us up. + */ + if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) { + if (wait_event_killable_exclusive(fcd->range_waitq, + (fcd->nr_free_ranges > 0))) { + return ERR_PTR(-EINTR); + } + } + } +} + +static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd, + struct inode *inode, + unsigned long start_idx) +{ + int ret; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + struct interval_tree_node *node; + + /* Find fuse dax mapping at file offset inode. */ + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + + /* Range already got cleaned up by somebody else */ + if (!node) + return 0; + dmap = node_to_dmap(node); + + /* still in use. */ + if (refcount_read(&dmap->refcnt) > 1) + return 0; + + ret = reclaim_one_dmap_locked(inode, dmap); + if (ret < 0) + return ret; + + /* Cleanup dmap entry and add back to free list */ + spin_lock(&fcd->lock); + dmap_reinit_add_to_free_pool(fcd, dmap); + spin_unlock(&fcd->lock); + return ret; +} + +/* + * Free a range of memory. + * Locking: + * 1. Take fi->i_mmap_sem to block dax faults. + * 2. Take fi->dax->sem to protect interval tree and also to make sure + * read/write can not reuse a dmap which we might be freeing. + */ +static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd, + struct inode *inode, + unsigned long start_idx, + unsigned long end_idx) +{ + int ret; + struct fuse_inode *fi = get_fuse_inode(inode); + loff_t dmap_start = start_idx << FUSE_DAX_SHIFT; + loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1; + + down_write(&fi->i_mmap_sem); + ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end); + if (ret) { + pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n", + ret); + goto out_mmap_sem; + } + + down_write(&fi->dax->sem); + ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx); + up_write(&fi->dax->sem); +out_mmap_sem: + up_write(&fi->i_mmap_sem); + return ret; +} + +static int try_to_free_dmap_chunks(struct fuse_conn_dax *fcd, + unsigned long nr_to_free) +{ + struct fuse_dax_mapping *dmap, *pos, *temp; + int ret, nr_freed = 0; + unsigned long start_idx = 0, end_idx = 0; + struct inode *inode = NULL; + + /* Pick first busy range and free it for now*/ + while (1) { + if (nr_freed >= nr_to_free) + break; + + dmap = NULL; + spin_lock(&fcd->lock); + + if (!fcd->nr_busy_ranges) { + spin_unlock(&fcd->lock); + return 0; + } + + list_for_each_entry_safe(pos, temp, &fcd->busy_ranges, + busy_list) { + /* skip this range if it's in use. */ + if (refcount_read(&pos->refcnt) > 1) + continue; + + inode = igrab(pos->inode); + /* + * This inode is going away. That will free + * up all the ranges anyway, continue to + * next range. + */ + if (!inode) + continue; + /* + * Take this element off list and add it tail. If + * this element can't be freed, it will help with + * selecting new element in next iteration of loop. + */ + dmap = pos; + list_move_tail(&dmap->busy_list, &fcd->busy_ranges); + start_idx = end_idx = dmap->itn.start; + break; + } + spin_unlock(&fcd->lock); + if (!dmap) + return 0; + + ret = lookup_and_reclaim_dmap(fcd, inode, start_idx, end_idx); + iput(inode); + if (ret) + return ret; + nr_freed++; + } + return 0; +} + +static void fuse_dax_free_mem_worker(struct work_struct *work) +{ + int ret; + struct fuse_conn_dax *fcd = container_of(work, struct fuse_conn_dax, + free_work.work); + ret = try_to_free_dmap_chunks(fcd, FUSE_DAX_RECLAIM_CHUNK); + if (ret) { + pr_debug("fuse: try_to_free_dmap_chunks() failed with err=%d\n", + ret); + } + + /* If number of free ranges are still below threhold, requeue */ + kick_dmap_free_worker(fcd, 1); +} + +static void fuse_free_dax_mem_ranges(struct list_head *mem_list) +{ + struct fuse_dax_mapping *range, *temp; + + /* Free All allocated elements */ + list_for_each_entry_safe(range, temp, mem_list, list) { + list_del(&range->list); + if (!list_empty(&range->busy_list)) + list_del(&range->busy_list); + kfree(range); + } +} + +void fuse_dax_conn_free(struct fuse_conn *fc) +{ + if (fc->dax) { + fuse_free_dax_mem_ranges(&fc->dax->free_ranges); + kfree(fc->dax); + } +} + +static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd) +{ + long nr_pages, nr_ranges; + void *kaddr; + pfn_t pfn; + struct fuse_dax_mapping *range; + int ret, id; + size_t dax_size = -1; + unsigned long i; + + init_waitqueue_head(&fcd->range_waitq); + INIT_LIST_HEAD(&fcd->free_ranges); + INIT_LIST_HEAD(&fcd->busy_ranges); + INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker); + + id = dax_read_lock(); + nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), &kaddr, + &pfn); + dax_read_unlock(id); + if (nr_pages < 0) { + pr_debug("dax_direct_access() returned %ld\n", nr_pages); + return nr_pages; + } + + nr_ranges = nr_pages/FUSE_DAX_PAGES; + pr_debug("%s: dax mapped %ld pages. nr_ranges=%ld\n", + __func__, nr_pages, nr_ranges); + + for (i = 0; i < nr_ranges; i++) { + range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL); + ret = -ENOMEM; + if (!range) + goto out_err; + + /* TODO: This offset only works if virtio-fs driver is not + * having some memory hidden at the beginning. This needs + * better handling + */ + range->window_offset = i * FUSE_DAX_SZ; + range->length = FUSE_DAX_SZ; + INIT_LIST_HEAD(&range->busy_list); + refcount_set(&range->refcnt, 1); + list_add_tail(&range->list, &fcd->free_ranges); + } + + fcd->nr_free_ranges = nr_ranges; + fcd->nr_ranges = nr_ranges; + return 0; +out_err: + /* Free All allocated elements */ + fuse_free_dax_mem_ranges(&fcd->free_ranges); + return ret; +} + +int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev) +{ + struct fuse_conn_dax *fcd; + int err; + + if (!dax_dev) + return 0; + + fcd = kzalloc(sizeof(*fcd), GFP_KERNEL); + if (!fcd) + return -ENOMEM; + + spin_lock_init(&fcd->lock); + fcd->dev = dax_dev; + err = fuse_dax_mem_range_init(fcd); + if (err) { + kfree(fcd); + return err; + } + + fc->dax = fcd; + return 0; +} + +bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + fi->dax = NULL; + if (fc->dax) { + fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT); + if (!fi->dax) + return false; + + init_rwsem(&fi->dax->sem); + fi->dax->tree = RB_ROOT_CACHED; + } + + return true; +} + +static const struct address_space_operations fuse_dax_file_aops = { + .writepages = fuse_dax_writepages, + .direct_IO = noop_direct_IO, + .set_page_dirty = noop_set_page_dirty, + .invalidatepage = noop_invalidatepage, +}; + +void fuse_dax_inode_init(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fc->dax) + return; + + inode->i_flags |= S_DAX; + inode->i_data.a_ops = &fuse_dax_file_aops; +} + +bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment) +{ + if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) { + pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n", + map_alignment, FUSE_DAX_SZ); + return false; + } + return true; +} + +void fuse_dax_cancel_work(struct fuse_conn *fc) +{ + struct fuse_conn_dax *fcd = fc->dax; + + if (fcd) + cancel_delayed_work_sync(&fcd->free_work); + +} +EXPORT_SYMBOL_GPL(fuse_dax_cancel_work); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 02b3c36b3676..588f8d1240aa 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -40,20 +40,21 @@ static struct fuse_dev *fuse_get_dev(struct file *file) return READ_ONCE(file->private_data); } -static void fuse_request_init(struct fuse_req *req) +static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) { INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); refcount_set(&req->count, 1); __set_bit(FR_PENDING, &req->flags); + req->fm = fm; } -static struct fuse_req *fuse_request_alloc(gfp_t flags) +static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags) { struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); if (req) - fuse_request_init(req); + fuse_request_init(fm, req); return req; } @@ -100,10 +101,11 @@ static void fuse_drop_waiting(struct fuse_conn *fc) } } -static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); +static void fuse_put_request(struct fuse_req *req); -static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) +static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background) { + struct fuse_conn *fc = fm->fc; struct fuse_req *req; int err; atomic_inc(&fc->num_waiting); @@ -125,7 +127,7 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) if (fc->conn_error) goto out; - req = fuse_request_alloc(GFP_KERNEL); + req = fuse_request_alloc(fm, GFP_KERNEL); err = -ENOMEM; if (!req) { if (for_background) @@ -143,7 +145,7 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) if (unlikely(req->in.h.uid == ((uid_t)-1) || req->in.h.gid == ((gid_t)-1))) { - fuse_put_request(fc, req); + fuse_put_request(req); return ERR_PTR(-EOVERFLOW); } return req; @@ -153,8 +155,10 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) return ERR_PTR(err); } -static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_put_request(struct fuse_req *req) { + struct fuse_conn *fc = req->fm->fc; + if (refcount_dec_and_test(&req->count)) { if (test_bit(FR_BACKGROUND, &req->flags)) { /* @@ -273,8 +277,10 @@ static void flush_bg_queue(struct fuse_conn *fc) * the 'end' callback is called if given, else the reference to the * request is released */ -void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_end(struct fuse_req *req) { + struct fuse_mount *fm = req->fm; + struct fuse_conn *fc = fm->fc; struct fuse_iqueue *fiq = &fc->iq; if (test_and_set_bit(FR_FINISHED, &req->flags)) @@ -309,9 +315,9 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&fc->blocked_waitq); } - if (fc->num_background == fc->congestion_threshold && fc->sb) { - clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + if (fc->num_background == fc->congestion_threshold && fm->sb) { + clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); + clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); } fc->num_background--; fc->active_background--; @@ -323,14 +329,16 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) } if (test_bit(FR_ASYNC, &req->flags)) - req->args->end(fc, req->args, req->out.h.error); + req->args->end(fm, req->args, req->out.h.error); put_request: - fuse_put_request(fc, req); + fuse_put_request(req); } EXPORT_SYMBOL_GPL(fuse_request_end); -static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) +static int queue_interrupt(struct fuse_req *req) { + struct fuse_iqueue *fiq = &req->fm->fc->iq; + spin_lock(&fiq->lock); /* Check for we've sent request to interrupt this req */ if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { @@ -357,8 +365,9 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) return 0; } -static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) +static void request_wait_answer(struct fuse_req *req) { + struct fuse_conn *fc = req->fm->fc; struct fuse_iqueue *fiq = &fc->iq; int err; @@ -373,7 +382,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) /* matches barrier in fuse_dev_do_read() */ smp_mb__after_atomic(); if (test_bit(FR_SENT, &req->flags)) - queue_interrupt(fiq, req); + queue_interrupt(req); } if (!test_bit(FR_FORCE, &req->flags)) { @@ -402,9 +411,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags)); } -static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +static void __fuse_request_send(struct fuse_req *req) { - struct fuse_iqueue *fiq = &fc->iq; + struct fuse_iqueue *fiq = &req->fm->fc->iq; BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); spin_lock(&fiq->lock); @@ -418,7 +427,7 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) __fuse_get_request(req); queue_request_and_unlock(fiq, req); - request_wait_answer(fc, req); + request_wait_answer(req); /* Pairs with smp_wmb() in fuse_request_end() */ smp_rmb(); } @@ -457,8 +466,10 @@ static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) } } -static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_force_creds(struct fuse_req *req) { + struct fuse_conn *fc = req->fm->fc; + req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); @@ -473,23 +484,24 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) __set_bit(FR_ASYNC, &req->flags); } -ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) +ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) { + struct fuse_conn *fc = fm->fc; struct fuse_req *req; ssize_t ret; if (args->force) { atomic_inc(&fc->num_waiting); - req = fuse_request_alloc(GFP_KERNEL | __GFP_NOFAIL); + req = fuse_request_alloc(fm, GFP_KERNEL | __GFP_NOFAIL); if (!args->nocreds) - fuse_force_creds(fc, req); + fuse_force_creds(req); __set_bit(FR_WAITING, &req->flags); __set_bit(FR_FORCE, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(fc, false); + req = fuse_get_req(fm, false); if (IS_ERR(req)) return PTR_ERR(req); } @@ -500,20 +512,21 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) if (!args->noreply) __set_bit(FR_ISREPLY, &req->flags); - __fuse_request_send(fc, req); + __fuse_request_send(req); ret = req->out.h.error; if (!ret && args->out_argvar) { BUG_ON(args->out_numargs == 0); ret = args->out_args[args->out_numargs - 1].size; } - fuse_put_request(fc, req); + fuse_put_request(req); return ret; } -static bool fuse_request_queue_background(struct fuse_conn *fc, - struct fuse_req *req) +static bool fuse_request_queue_background(struct fuse_req *req) { + struct fuse_mount *fm = req->fm; + struct fuse_conn *fc = fm->fc; bool queued = false; WARN_ON(!test_bit(FR_BACKGROUND, &req->flags)); @@ -527,9 +540,9 @@ static bool fuse_request_queue_background(struct fuse_conn *fc, fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; - if (fc->num_background == fc->congestion_threshold && fc->sb) { - set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + if (fc->num_background == fc->congestion_threshold && fm->sb) { + set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); + set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); } list_add_tail(&req->list, &fc->bg_queue); flush_bg_queue(fc); @@ -540,28 +553,28 @@ static bool fuse_request_queue_background(struct fuse_conn *fc, return queued; } -int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, +int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags) { struct fuse_req *req; if (args->force) { WARN_ON(!args->nocreds); - req = fuse_request_alloc(gfp_flags); + req = fuse_request_alloc(fm, gfp_flags); if (!req) return -ENOMEM; __set_bit(FR_BACKGROUND, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(fc, true); + req = fuse_get_req(fm, true); if (IS_ERR(req)) return PTR_ERR(req); } fuse_args_to_req(req, args); - if (!fuse_request_queue_background(fc, req)) { - fuse_put_request(fc, req); + if (!fuse_request_queue_background(req)) { + fuse_put_request(req); return -ENOTCONN; } @@ -569,14 +582,14 @@ int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, } EXPORT_SYMBOL_GPL(fuse_simple_background); -static int fuse_simple_notify_reply(struct fuse_conn *fc, +static int fuse_simple_notify_reply(struct fuse_mount *fm, struct fuse_args *args, u64 unique) { struct fuse_req *req; - struct fuse_iqueue *fiq = &fc->iq; + struct fuse_iqueue *fiq = &fm->fc->iq; int err = 0; - req = fuse_get_req(fc, false); + req = fuse_get_req(fm, false); if (IS_ERR(req)) return PTR_ERR(req); @@ -591,7 +604,7 @@ static int fuse_simple_notify_reply(struct fuse_conn *fc, } else { err = -ENODEV; spin_unlock(&fiq->lock); - fuse_put_request(fc, req); + fuse_put_request(req); } return err; @@ -785,15 +798,16 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) struct page *newpage; struct pipe_buffer *buf = cs->pipebufs; + get_page(oldpage); err = unlock_request(cs->req); if (err) - return err; + goto out_put_old; fuse_copy_finish(cs); err = pipe_buf_confirm(cs->pipe, buf); if (err) - return err; + goto out_put_old; BUG_ON(!cs->nr_segs); cs->currbuf = buf; @@ -833,7 +847,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL); if (err) { unlock_page(newpage); - return err; + goto out_put_old; } get_page(newpage); @@ -852,14 +866,19 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (err) { unlock_page(newpage); put_page(newpage); - return err; + goto out_put_old; } unlock_page(oldpage); + /* Drop ref for ap->pages[] array */ put_page(oldpage); cs->len = 0; - return 0; + err = 0; +out_put_old: + /* Drop ref obtained in this function */ + put_page(oldpage); + return err; out_fallback_unlock: unlock_page(newpage); @@ -868,10 +887,10 @@ out_fallback: cs->offset = buf->offset; err = lock_request(cs->req); - if (err) - return err; + if (!err) + err = 1; - return 1; + goto out_put_old; } static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, @@ -883,14 +902,16 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, if (cs->nr_segs >= cs->pipe->max_usage) return -EIO; + get_page(page); err = unlock_request(cs->req); - if (err) + if (err) { + put_page(page); return err; + } fuse_copy_finish(cs); buf = cs->pipebufs; - get_page(page); buf->page = page; buf->offset = offset; buf->len = count; @@ -1250,7 +1271,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, /* SETXATTR is special, since it may contain too large data */ if (args->opcode == FUSE_SETXATTR) req->out.h.error = -E2BIG; - fuse_request_end(fc, req); + fuse_request_end(req); goto restart; } spin_lock(&fpq->lock); @@ -1284,8 +1305,8 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, /* matches barrier in request_wait_answer() */ smp_mb__after_atomic(); if (test_bit(FR_INTERRUPTED, &req->flags)) - queue_interrupt(fiq, req); - fuse_put_request(fc, req); + queue_interrupt(req); + fuse_put_request(req); return reqsize; @@ -1293,7 +1314,7 @@ out_end: if (!test_bit(FR_PRIVATE, &req->flags)) list_del_init(&req->list); spin_unlock(&fpq->lock); - fuse_request_end(fc, req); + fuse_request_end(req); return err; err_unlock: @@ -1416,11 +1437,8 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, fuse_copy_finish(cs); down_read(&fc->killsb); - err = -ENOENT; - if (fc->sb) { - err = fuse_reverse_inval_inode(fc->sb, outarg.ino, - outarg.off, outarg.len); - } + err = fuse_reverse_inval_inode(fc, outarg.ino, + outarg.off, outarg.len); up_read(&fc->killsb); return err; @@ -1466,9 +1484,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = -ENOENT; - if (fc->sb) - err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name); up_read(&fc->killsb); kfree(buf); return err; @@ -1516,10 +1532,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = -ENOENT; - if (fc->sb) - err = fuse_reverse_inval_entry(fc->sb, outarg.parent, - outarg.child, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name); up_read(&fc->killsb); kfree(buf); return err; @@ -1561,10 +1574,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = -ENOENT; - if (!fc->sb) - goto out_up_killsb; - - inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + inode = fuse_ilookup(fc, nodeid, NULL); if (!inode) goto out_up_killsb; @@ -1621,7 +1631,7 @@ struct fuse_retrieve_args { struct fuse_notify_retrieve_in inarg; }; -static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args, int error) { struct fuse_retrieve_args *ra = @@ -1631,7 +1641,7 @@ static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args, kfree(ra); } -static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, +static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, struct fuse_notify_retrieve_out *outarg) { int err; @@ -1642,6 +1652,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, unsigned int offset; size_t total_len = 0; unsigned int num_pages; + struct fuse_conn *fc = fm->fc; struct fuse_retrieve_args *ra; size_t args_size = sizeof(*ra); struct fuse_args_pages *ap; @@ -1703,9 +1714,9 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, args->in_args[0].value = &ra->inarg; args->in_args[1].size = total_len; - err = fuse_simple_notify_reply(fc, args, outarg->notify_unique); + err = fuse_simple_notify_reply(fm, args, outarg->notify_unique); if (err) - fuse_retrieve_end(fc, args, err); + fuse_retrieve_end(fm, args, err); return err; } @@ -1714,7 +1725,9 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_retrieve_out outarg; + struct fuse_mount *fm; struct inode *inode; + u64 nodeid; int err; err = -EINVAL; @@ -1729,14 +1742,12 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = -ENOENT; - if (fc->sb) { - u64 nodeid = outarg.nodeid; + nodeid = outarg.nodeid; - inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); - if (inode) { - err = fuse_retrieve(fc, inode, &outarg); - iput(inode); - } + inode = fuse_ilookup(fc, nodeid, &fm); + if (inode) { + err = fuse_retrieve(fm, inode, &outarg); + iput(inode); } up_read(&fc->killsb); @@ -1875,9 +1886,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, else if (oh.error == -ENOSYS) fc->no_interrupt = 1; else if (oh.error == -EAGAIN) - err = queue_interrupt(&fc->iq, req); + err = queue_interrupt(req); - fuse_put_request(fc, req); + fuse_put_request(req); goto copy_finish; } @@ -1907,7 +1918,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, list_del_init(&req->list); spin_unlock(&fpq->lock); - fuse_request_end(fc, req); + fuse_request_end(req); out: return err ? err : nbytes; @@ -2045,7 +2056,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) } /* Abort all requests on the given list (pending or processing) */ -static void end_requests(struct fuse_conn *fc, struct list_head *head) +static void end_requests(struct list_head *head) { while (!list_empty(head)) { struct fuse_req *req; @@ -2053,7 +2064,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) req->out.h.error = -ECONNABORTED; clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); - fuse_request_end(fc, req); + fuse_request_end(req); } } @@ -2148,7 +2159,7 @@ void fuse_abort_conn(struct fuse_conn *fc) wake_up_all(&fc->blocked_waitq); spin_unlock(&fc->lock); - end_requests(fc, &to_end); + end_requests(&to_end); } else { spin_unlock(&fc->lock); } @@ -2178,7 +2189,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) list_splice_init(&fpq->processing[i], &to_end); spin_unlock(&fpq->lock); - end_requests(fc, &to_end); + end_requests(&to_end); /* Are we the last open device? */ if (atomic_dec_and_test(&fc->dev_count)) { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 26f028bc760b..ff7dbeb16f88 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/file.h> +#include <linux/fs_context.h> #include <linux/sched.h> #include <linux/namei.h> #include <linux/slab.h> @@ -196,7 +197,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) { struct inode *inode; struct dentry *parent; - struct fuse_conn *fc; + struct fuse_mount *fm; struct fuse_inode *fi; int ret; @@ -218,27 +219,29 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (flags & LOOKUP_RCU) goto out; - fc = get_fuse_conn(inode); + fm = get_fuse_mount(inode); forget = fuse_alloc_forget(); ret = -ENOMEM; if (!forget) goto out; - attr_version = fuse_get_attr_version(fc); + attr_version = fuse_get_attr_version(fm->fc); parent = dget_parent(entry); - fuse_lookup_init(fc, &args, get_node_id(d_inode(parent)), + fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)), &entry->d_name, &outarg); - ret = fuse_simple_request(fc, &args); + ret = fuse_simple_request(fm, &args); dput(parent); /* Zero nodeid is same as -ENOENT */ if (!ret && !outarg.nodeid) ret = -ENOENT; if (!ret) { fi = get_fuse_inode(inode); - if (outarg.nodeid != get_node_id(inode)) { - fuse_queue_forget(fc, forget, outarg.nodeid, 1); + if (outarg.nodeid != get_node_id(inode) || + (bool) IS_AUTOMOUNT(inode) != (bool) (outarg.attr.flags & FUSE_ATTR_SUBMOUNT)) { + fuse_queue_forget(fm->fc, forget, + outarg.nodeid, 1); goto invalid; } spin_lock(&fi->lock); @@ -298,6 +301,79 @@ static int fuse_dentry_delete(const struct dentry *dentry) return time_before64(fuse_dentry_time(dentry), get_jiffies_64()); } +/* + * Create a fuse_mount object with a new superblock (with path->dentry + * as the root), and return that mount so it can be auto-mounted on + * @path. + */ +static struct vfsmount *fuse_dentry_automount(struct path *path) +{ + struct fs_context *fsc; + struct fuse_mount *parent_fm = get_fuse_mount_super(path->mnt->mnt_sb); + struct fuse_conn *fc = parent_fm->fc; + struct fuse_mount *fm; + struct vfsmount *mnt; + struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry)); + struct super_block *sb; + int err; + + fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry); + if (IS_ERR(fsc)) { + err = PTR_ERR(fsc); + goto out; + } + + err = -ENOMEM; + fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); + if (!fm) + goto out_put_fsc; + + refcount_set(&fm->count, 1); + fsc->s_fs_info = fm; + sb = sget_fc(fsc, NULL, set_anon_super_fc); + if (IS_ERR(sb)) { + err = PTR_ERR(sb); + fuse_mount_put(fm); + goto out_put_fsc; + } + fm->fc = fuse_conn_get(fc); + + /* Initialize superblock, making @mp_fi its root */ + err = fuse_fill_super_submount(sb, mp_fi); + if (err) + goto out_put_sb; + + sb->s_flags |= SB_ACTIVE; + fsc->root = dget(sb->s_root); + /* We are done configuring the superblock, so unlock it */ + up_write(&sb->s_umount); + + down_write(&fc->killsb); + list_add_tail(&fm->fc_entry, &fc->mounts); + up_write(&fc->killsb); + + /* Create the submount */ + mnt = vfs_create_mount(fsc); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); + goto out_put_fsc; + } + mntget(mnt); + put_fs_context(fsc); + return mnt; + +out_put_sb: + /* + * Only jump here when fsc->root is NULL and sb is still locked + * (otherwise put_fs_context() will put the superblock) + */ + deactivate_locked_super(sb); +out_put_fsc: + put_fs_context(fsc); +out: + return ERR_PTR(err); +} + const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, .d_delete = fuse_dentry_delete, @@ -305,6 +381,7 @@ const struct dentry_operations fuse_dentry_operations = { .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, #endif + .d_automount = fuse_dentry_automount, }; const struct dentry_operations fuse_root_dentry_operations = { @@ -329,7 +406,7 @@ bool fuse_invalid_attr(struct fuse_attr *attr) int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode) { - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); struct fuse_forget_link *forget; u64 attr_version; @@ -346,10 +423,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name if (!forget) goto out; - attr_version = fuse_get_attr_version(fc); + attr_version = fuse_get_attr_version(fm->fc); - fuse_lookup_init(fc, &args, nodeid, name, outarg); - err = fuse_simple_request(fc, &args); + fuse_lookup_init(fm->fc, &args, nodeid, name, outarg); + err = fuse_simple_request(fm, &args); /* Zero nodeid is same as -ENOENT, but with valid timeout */ if (err || !outarg->nodeid) goto out_put_forget; @@ -365,7 +442,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name attr_version); err = -ENOMEM; if (!*inode) { - fuse_queue_forget(fc, forget, outarg->nodeid, 1); + fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1); goto out; } err = 0; @@ -434,7 +511,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, { int err; struct inode *inode; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); struct fuse_forget_link *forget; struct fuse_create_in inarg; @@ -452,11 +529,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, goto out_err; err = -ENOMEM; - ff = fuse_file_alloc(fc); + ff = fuse_file_alloc(fm); if (!ff) goto out_put_forget_req; - if (!fc->dont_mask) + if (!fm->fc->dont_mask) mode &= ~current_umask(); flags &= ~O_NOCTTY; @@ -477,7 +554,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_args[0].value = &outentry; args.out_args[1].size = sizeof(outopen); args.out_args[1].value = &outopen; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err) goto out_free_ff; @@ -494,7 +571,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(NULL, ff, flags); - fuse_queue_forget(fc, forget, outentry.nodeid, 1); + fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); err = -ENOMEM; goto out_err; } @@ -567,7 +644,7 @@ no_open: /* * Code shared between mknod, mkdir, symlink and link */ -static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, +static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, struct inode *dir, struct dentry *entry, umode_t mode) { @@ -586,7 +663,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, args->out_numargs = 1; args->out_args[0].size = sizeof(outarg); args->out_args[0].value = &outarg; - err = fuse_simple_request(fc, args); + err = fuse_simple_request(fm, args); if (err) goto out_put_forget_req; @@ -600,7 +677,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, &outarg.attr, entry_attr_timeout(&outarg), 0); if (!inode) { - fuse_queue_forget(fc, forget, outarg.nodeid, 1); + fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); return -ENOMEM; } kfree(forget); @@ -628,10 +705,10 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev) { struct fuse_mknod_in inarg; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); - if (!fc->dont_mask) + if (!fm->fc->dont_mask) mode &= ~current_umask(); memset(&inarg, 0, sizeof(inarg)); @@ -644,7 +721,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(fc, &args, dir, entry, mode); + return create_new_entry(fm, &args, dir, entry, mode); } static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, @@ -656,10 +733,10 @@ static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); - if (!fc->dont_mask) + if (!fm->fc->dont_mask) mode &= ~current_umask(); memset(&inarg, 0, sizeof(inarg)); @@ -671,13 +748,13 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(fc, &args, dir, entry, S_IFDIR); + return create_new_entry(fm, &args, dir, entry, S_IFDIR); } static int fuse_symlink(struct inode *dir, struct dentry *entry, const char *link) { - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); unsigned len = strlen(link) + 1; FUSE_ARGS(args); @@ -687,7 +764,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, args.in_args[0].value = entry->d_name.name; args.in_args[1].size = len; args.in_args[1].value = link; - return create_new_entry(fc, &args, dir, entry, S_IFLNK); + return create_new_entry(fm, &args, dir, entry, S_IFLNK); } void fuse_update_ctime(struct inode *inode) @@ -701,7 +778,7 @@ void fuse_update_ctime(struct inode *inode) static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); args.opcode = FUSE_UNLINK; @@ -709,13 +786,13 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) args.in_numargs = 1; args.in_args[0].size = entry->d_name.len + 1; args.in_args[0].value = entry->d_name.name; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) { struct inode *inode = d_inode(entry); struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fc->attr_version); + fi->attr_version = atomic64_inc_return(&fm->fc->attr_version); /* * If i_nlink == 0 then unlink doesn't make sense, yet this can * happen if userspace filesystem is careless. It would be @@ -737,7 +814,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); args.opcode = FUSE_RMDIR; @@ -745,7 +822,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) args.in_numargs = 1; args.in_args[0].size = entry->d_name.len + 1; args.in_args[0].value = entry->d_name.name; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) { clear_nlink(d_inode(entry)); fuse_dir_changed(dir); @@ -761,7 +838,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, { int err; struct fuse_rename2_in inarg; - struct fuse_conn *fc = get_fuse_conn(olddir); + struct fuse_mount *fm = get_fuse_mount(olddir); FUSE_ARGS(args); memset(&inarg, 0, argsize); @@ -776,7 +853,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, args.in_args[1].value = oldent->d_name.name; args.in_args[2].size = newent->d_name.len + 1; args.in_args[2].value = newent->d_name.name; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) { /* ctime changes */ fuse_invalidate_attr(d_inode(oldent)); @@ -847,7 +924,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, int err; struct fuse_link_in inarg; struct inode *inode = d_inode(entry); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); memset(&inarg, 0, sizeof(inarg)); @@ -858,7 +935,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, args.in_args[0].value = &inarg; args.in_args[1].size = newent->d_name.len + 1; args.in_args[1].value = newent->d_name.name; - err = create_new_entry(fc, &args, newdir, newent, inode->i_mode); + err = create_new_entry(fm, &args, newdir, newent, inode->i_mode); /* Contrary to "normal" filesystems it can happen that link makes two "logical" inodes point to the same "physical" inode. We invalidate the attributes of the old one, so it @@ -869,7 +946,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fc->attr_version); + fi->attr_version = atomic64_inc_return(&fm->fc->attr_version); if (likely(inode->i_nlink < UINT_MAX)) inc_nlink(inode); spin_unlock(&fi->lock); @@ -926,11 +1003,11 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, int err; struct fuse_getattr_in inarg; struct fuse_attr_out outarg; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); u64 attr_version; - attr_version = fuse_get_attr_version(fc); + attr_version = fuse_get_attr_version(fm->fc); memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); @@ -949,7 +1026,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) { if (fuse_invalid_attr(&outarg.attr) || (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { @@ -1002,7 +1079,7 @@ int fuse_update_attributes(struct inode *inode, struct file *file) STATX_BASIC_STATS & ~STATX_ATIME, 0); } -int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, +int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, u64 child_nodeid, struct qstr *name) { int err = -ENOTDIR; @@ -1010,7 +1087,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, struct dentry *dir; struct dentry *entry; - parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid); + parent = fuse_ilookup(fc, parent_nodeid, NULL); if (!parent) return -ENOENT; @@ -1102,14 +1179,14 @@ int fuse_allow_current_process(struct fuse_conn *fc) static int fuse_access(struct inode *inode, int mask) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_access_in inarg; int err; BUG_ON(mask & MAY_NOT_BLOCK); - if (fc->no_access) + if (fm->fc->no_access) return 0; memset(&inarg, 0, sizeof(inarg)); @@ -1119,9 +1196,9 @@ static int fuse_access(struct inode *inode, int mask) args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_access = 1; + fm->fc->no_access = 1; err = 0; } return err; @@ -1209,7 +1286,7 @@ static int fuse_permission(struct inode *inode, int mask) static int fuse_readlink_page(struct inode *inode, struct page *page) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 }; struct fuse_args_pages ap = { .num_pages = 1, @@ -1226,7 +1303,7 @@ static int fuse_readlink_page(struct inode *inode, struct page *page) ap.args.page_zeroing = true; ap.args.out_numargs = 1; ap.args.out_args[0].size = desc.length; - res = fuse_simple_request(fc, &ap.args); + res = fuse_simple_request(fm, &ap.args); fuse_invalidate_atime(inode); @@ -1454,7 +1531,7 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, */ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; @@ -1465,7 +1542,7 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) inarg.valid = FATTR_MTIME; inarg.mtime = inode->i_mtime.tv_sec; inarg.mtimensec = inode->i_mtime.tv_nsec; - if (fc->minor >= 23) { + if (fm->fc->minor >= 23) { inarg.valid |= FATTR_CTIME; inarg.ctime = inode->i_ctime.tv_sec; inarg.ctimensec = inode->i_ctime.tv_nsec; @@ -1474,9 +1551,9 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) inarg.valid |= FATTR_FH; inarg.fh = ff->fh; } - fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); + fuse_setattr_fill(fm->fc, &args, inode, &inarg, &outarg); - return fuse_simple_request(fc, &args); + return fuse_simple_request(fm, &args); } /* @@ -1491,7 +1568,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, struct file *file) { struct inode *inode = d_inode(dentry); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn *fc = fm->fc; struct fuse_inode *fi = get_fuse_inode(inode); FUSE_ARGS(args); struct fuse_setattr_in inarg; @@ -1501,6 +1579,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, loff_t oldsize; int err; bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); + bool fault_blocked = false; if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; @@ -1509,6 +1588,22 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (err) return err; + if (attr->ia_valid & ATTR_SIZE) { + if (WARN_ON(!S_ISREG(inode->i_mode))) + return -EIO; + is_truncate = true; + } + + if (FUSE_IS_DAX(inode) && is_truncate) { + down_write(&fi->i_mmap_sem); + fault_blocked = true; + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) { + up_write(&fi->i_mmap_sem); + return err; + } + } + if (attr->ia_valid & ATTR_OPEN) { /* This is coming from open(..., ... | O_TRUNC); */ WARN_ON(!(attr->ia_valid & ATTR_SIZE)); @@ -1521,17 +1616,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, */ i_size_write(inode, 0); truncate_pagecache(inode, 0); - return 0; + goto out; } file = NULL; } - if (attr->ia_valid & ATTR_SIZE) { - if (WARN_ON(!S_ISREG(inode->i_mode))) - return -EIO; - is_truncate = true; - } - /* Flush dirty data/metadata before non-truncate SETATTR */ if (is_wb && S_ISREG(inode->i_mode) && attr->ia_valid & @@ -1566,7 +1655,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, inarg.lock_owner = fuse_lock_owner_id(fc, current->files); } fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err) { if (err == -EINTR) fuse_invalidate_attr(inode); @@ -1614,6 +1703,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); +out: + if (fault_blocked) + up_write(&fi->i_mmap_sem); + return 0; error: @@ -1621,6 +1714,9 @@ error: fuse_release_nowrite(inode); clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + + if (fault_blocked) + up_write(&fi->i_mmap_sem); return err; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 43c165e796da..c03034e8c152 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -32,7 +32,7 @@ static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, return pages; } -static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, +static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, struct file *file, int opcode, struct fuse_open_out *outargp) { struct fuse_open_in inarg; @@ -40,7 +40,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, memset(&inarg, 0, sizeof(inarg)); inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); - if (!fc->atomic_o_trunc) + if (!fm->fc->atomic_o_trunc) inarg.flags &= ~O_TRUNC; args.opcode = opcode; args.nodeid = nodeid; @@ -51,7 +51,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, args.out_args[0].size = sizeof(*outargp); args.out_args[0].value = outargp; - return fuse_simple_request(fc, &args); + return fuse_simple_request(fm, &args); } struct fuse_release_args { @@ -60,7 +60,7 @@ struct fuse_release_args { struct inode *inode; }; -struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) { struct fuse_file *ff; @@ -68,7 +68,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) if (unlikely(!ff)) return NULL; - ff->fc = fc; + ff->fm = fm; ff->release_args = kzalloc(sizeof(*ff->release_args), GFP_KERNEL_ACCOUNT); if (!ff->release_args) { @@ -82,7 +82,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); - ff->kh = atomic64_inc_return(&fc->khctr); + ff->kh = atomic64_inc_return(&fm->fc->khctr); return ff; } @@ -100,7 +100,7 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff) return ff; } -static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args, int error) { struct fuse_release_args *ra = container_of(args, typeof(*ra), args); @@ -114,29 +114,30 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) if (refcount_dec_and_test(&ff->count)) { struct fuse_args *args = &ff->release_args->args; - if (isdir ? ff->fc->no_opendir : ff->fc->no_open) { + if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) { /* Do nothing when client does not implement 'open' */ - fuse_release_end(ff->fc, args, 0); + fuse_release_end(ff->fm, args, 0); } else if (sync) { - fuse_simple_request(ff->fc, args); - fuse_release_end(ff->fc, args, 0); + fuse_simple_request(ff->fm, args); + fuse_release_end(ff->fm, args, 0); } else { args->end = fuse_release_end; - if (fuse_simple_background(ff->fc, args, + if (fuse_simple_background(ff->fm, args, GFP_KERNEL | __GFP_NOFAIL)) - fuse_release_end(ff->fc, args, -ENOTCONN); + fuse_release_end(ff->fm, args, -ENOTCONN); } kfree(ff); } } -int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, +int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir) { + struct fuse_conn *fc = fm->fc; struct fuse_file *ff; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; - ff = fuse_file_alloc(fc); + ff = fuse_file_alloc(fm); if (!ff) return -ENOMEM; @@ -147,7 +148,7 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, struct fuse_open_out outarg; int err; - err = fuse_send_open(fc, nodeid, file, opcode, &outarg); + err = fuse_send_open(fm, nodeid, file, opcode, &outarg); if (!err) { ff->fh = outarg.fh; ff->open_flags = outarg.open_flags; @@ -216,27 +217,40 @@ void fuse_finish_open(struct inode *inode, struct file *file) int fuse_open_common(struct inode *inode, struct file *file, bool isdir) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn *fc = fm->fc; int err; bool is_wb_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc && fc->writeback_cache; + bool dax_truncate = (file->f_flags & O_TRUNC) && + fc->atomic_o_trunc && FUSE_IS_DAX(inode); err = generic_file_open(inode, file); if (err) return err; - if (is_wb_truncate) { + if (is_wb_truncate || dax_truncate) { inode_lock(inode); fuse_set_nowrite(inode); } - err = fuse_do_open(fc, get_node_id(inode), file, isdir); + if (dax_truncate) { + down_write(&get_fuse_inode(inode)->i_mmap_sem); + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) + goto out; + } + err = fuse_do_open(fm, get_node_id(inode), file, isdir); if (!err) fuse_finish_open(inode, file); - if (is_wb_truncate) { +out: + if (dax_truncate) + up_write(&get_fuse_inode(inode)->i_mmap_sem); + + if (is_wb_truncate | dax_truncate) { fuse_release_nowrite(inode); inode_unlock(inode); } @@ -247,7 +261,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, int flags, int opcode) { - struct fuse_conn *fc = ff->fc; + struct fuse_conn *fc = ff->fm->fc; struct fuse_release_args *ra = ff->release_args; /* Inode is NULL on error path of fuse_create_open() */ @@ -285,7 +299,7 @@ void fuse_release_common(struct file *file, bool isdir) if (ff->flock) { ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; - ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc, + ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, (fl_owner_t) file); } /* Hold inode until release is finished */ @@ -300,7 +314,7 @@ void fuse_release_common(struct file *file, bool isdir) * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. */ - fuse_file_put(ff, ff->fc->destroy, isdir); + fuse_file_put(ff, ff->fm->fc->destroy, isdir); } static int fuse_open(struct inode *inode, struct file *file) @@ -443,7 +457,7 @@ static void fuse_sync_writes(struct inode *inode) static int fuse_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; struct fuse_flush_in inarg; FUSE_ARGS(args); @@ -465,12 +479,12 @@ static int fuse_flush(struct file *file, fl_owner_t id) return err; err = 0; - if (fc->no_flush) + if (fm->fc->no_flush) goto inval_attr_out; memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; - inarg.lock_owner = fuse_lock_owner_id(fc, id); + inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); args.opcode = FUSE_FLUSH; args.nodeid = get_node_id(inode); args.in_numargs = 1; @@ -478,9 +492,9 @@ static int fuse_flush(struct file *file, fl_owner_t id) args.in_args[0].value = &inarg; args.force = true; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_flush = 1; + fm->fc->no_flush = 1; err = 0; } @@ -489,7 +503,7 @@ inval_attr_out: * In memory i_blocks is not maintained by fuse, if writeback cache is * enabled, i_blocks from cached attr may not be accurate. */ - if (!err && fc->writeback_cache) + if (!err && fm->fc->writeback_cache) fuse_invalidate_attr(inode); return err; } @@ -498,7 +512,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, int datasync, int opcode) { struct inode *inode = file->f_mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; FUSE_ARGS(args); struct fuse_fsync_in inarg; @@ -511,7 +525,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - return fuse_simple_request(fc, &args); + return fuse_simple_request(fm, &args); } static int fuse_fsync(struct file *file, loff_t start, loff_t end, @@ -686,7 +700,7 @@ static void fuse_io_free(struct fuse_io_args *ia) kfree(ia); } -static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, int err) { struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); @@ -715,7 +729,7 @@ static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args, fuse_io_free(ia); } -static ssize_t fuse_async_req_send(struct fuse_conn *fc, +static ssize_t fuse_async_req_send(struct fuse_mount *fm, struct fuse_io_args *ia, size_t num_bytes) { ssize_t err; @@ -729,9 +743,9 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc, ia->ap.args.end = fuse_aio_complete_req; ia->ap.args.may_block = io->should_dirty; - err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL); + err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL); if (err) - fuse_aio_complete_req(fc, &ia->ap.args, err); + fuse_aio_complete_req(fm, &ia->ap.args, err); return num_bytes; } @@ -741,18 +755,18 @@ static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, { struct file *file = ia->io->iocb->ki_filp; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; fuse_read_args_fill(ia, file, pos, count, FUSE_READ); if (owner != NULL) { ia->read.in.read_flags |= FUSE_READ_LOCKOWNER; - ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner); + ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner); } if (ia->io->async) - return fuse_async_req_send(fc, ia, count); + return fuse_async_req_send(fm, ia, count); - return fuse_simple_request(fc, &ia->ap.args); + return fuse_simple_request(fm, &ia->ap.args); } static void fuse_read_update_size(struct inode *inode, loff_t size, @@ -798,7 +812,7 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, static int fuse_do_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); loff_t pos = page_offset(page); struct fuse_page_desc desc = { .length = PAGE_SIZE }; struct fuse_io_args ia = { @@ -818,14 +832,14 @@ static int fuse_do_readpage(struct file *file, struct page *page) */ fuse_wait_on_page_writeback(inode, page->index); - attr_ver = fuse_get_attr_version(fc); + attr_ver = fuse_get_attr_version(fm->fc); /* Don't overflow end offset */ if (pos + (desc.length - 1) == LLONG_MAX) desc.length--; fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); - res = fuse_simple_request(fc, &ia.ap.args); + res = fuse_simple_request(fm, &ia.ap.args); if (res < 0) return res; /* @@ -855,7 +869,7 @@ static int fuse_readpage(struct file *file, struct page *page) return err; } -static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, int err) { int i; @@ -899,7 +913,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args, static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) { struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; struct fuse_args_pages *ap = &ia->ap; loff_t pos = page_offset(ap->pages[0]); size_t count = ap->num_pages << PAGE_SHIFT; @@ -918,18 +932,18 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) WARN_ON((loff_t) (pos + count) < 0); fuse_read_args_fill(ia, file, pos, count, FUSE_READ); - ia->read.attr_ver = fuse_get_attr_version(fc); - if (fc->async_read) { + ia->read.attr_ver = fuse_get_attr_version(fm->fc); + if (fm->fc->async_read) { ia->ff = fuse_file_get(ff); ap->args.end = fuse_readpages_end; - err = fuse_simple_background(fc, &ap->args, GFP_KERNEL); + err = fuse_simple_background(fm, &ap->args, GFP_KERNEL); if (!err) return; } else { - res = fuse_simple_request(fc, &ap->args); + res = fuse_simple_request(fm, &ap->args); err = res < 0 ? res : 0; } - fuse_readpages_end(fc, &ap->args, err); + fuse_readpages_end(fm, &ap->args, err); } static void fuse_readahead(struct readahead_control *rac) @@ -1000,7 +1014,7 @@ static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, args->opcode = FUSE_WRITE; args->nodeid = ff->nodeid; args->in_numargs = 2; - if (ff->fc->minor < 9) + if (ff->fm->fc->minor < 9) args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; else args->in_args[0].size = sizeof(ia->write.in); @@ -1029,7 +1043,7 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, struct kiocb *iocb = ia->io->iocb; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; struct fuse_write_in *inarg = &ia->write.in; ssize_t err; @@ -1037,13 +1051,13 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, inarg->flags = fuse_write_flags(iocb); if (owner != NULL) { inarg->write_flags |= FUSE_WRITE_LOCKOWNER; - inarg->lock_owner = fuse_lock_owner_id(fc, owner); + inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner); } if (ia->io->async) - return fuse_async_req_send(fc, ia, count); + return fuse_async_req_send(fm, ia, count); - err = fuse_simple_request(fc, &ia->ap.args); + err = fuse_simple_request(fm, &ia->ap.args); if (!err && ia->write.out.size > count) err = -EIO; @@ -1074,7 +1088,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, struct fuse_args_pages *ap = &ia->ap; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; unsigned int offset, i; int err; @@ -1084,7 +1098,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); - err = fuse_simple_request(fc, &ap->args); + err = fuse_simple_request(fm, &ap->args); if (!err && ia->write.out.size > count) err = -EIO; @@ -1399,7 +1413,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, struct file *file = io->iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_conn *fc = ff->fm->fc; size_t nmax = write ? fc->max_write : fc->max_read; loff_t pos = *ppos; size_t count = iov_iter_count(iter); @@ -1539,10 +1553,14 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); - if (is_bad_inode(file_inode(file))) + if (is_bad_inode(inode)) return -EIO; + if (FUSE_IS_DAX(inode)) + return fuse_dax_read_iter(iocb, to); + if (!(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_cache_read_iter(iocb, to); else @@ -1553,10 +1571,14 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); - if (is_bad_inode(file_inode(file))) + if (is_bad_inode(inode)) return -EIO; + if (FUSE_IS_DAX(inode)) + return fuse_dax_write_iter(iocb, from); + if (!(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_cache_write_iter(iocb, from); else @@ -1578,7 +1600,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) kfree(wpa); } -static void fuse_writepage_finish(struct fuse_conn *fc, +static void fuse_writepage_finish(struct fuse_mount *fm, struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; @@ -1596,7 +1618,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, } /* Called under fi->lock, may release and reacquire it */ -static void fuse_send_writepage(struct fuse_conn *fc, +static void fuse_send_writepage(struct fuse_mount *fm, struct fuse_writepage_args *wpa, loff_t size) __releases(fi->lock) __acquires(fi->lock) @@ -1622,10 +1644,10 @@ __acquires(fi->lock) args->force = true; args->nocreds = true; - err = fuse_simple_background(fc, args, GFP_ATOMIC); + err = fuse_simple_background(fm, args, GFP_ATOMIC); if (err == -ENOMEM) { spin_unlock(&fi->lock); - err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL); + err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL); spin_lock(&fi->lock); } @@ -1638,7 +1660,7 @@ __acquires(fi->lock) out_free: fi->writectr--; rb_erase(&wpa->writepages_entry, &fi->writepages); - fuse_writepage_finish(fc, wpa); + fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); /* After fuse_writepage_finish() aux request list is private */ @@ -1662,7 +1684,7 @@ void fuse_flush_writepages(struct inode *inode) __releases(fi->lock) __acquires(fi->lock) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_inode *fi = get_fuse_inode(inode); loff_t crop = i_size_read(inode); struct fuse_writepage_args *wpa; @@ -1671,7 +1693,7 @@ __acquires(fi->lock) wpa = list_entry(fi->queued_writes.next, struct fuse_writepage_args, queue_entry); list_del_init(&wpa->queue_entry); - fuse_send_writepage(fc, wpa, crop); + fuse_send_writepage(fm, wpa, crop); } } @@ -1712,7 +1734,7 @@ static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) WARN_ON(fuse_insert_writeback(root, wpa)); } -static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, int error) { struct fuse_writepage_args *wpa = @@ -1724,7 +1746,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, spin_lock(&fi->lock); rb_erase(&wpa->writepages_entry, &fi->writepages); while (wpa->next) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_write_in *inarg = &wpa->ia.write.in; struct fuse_writepage_args *next = wpa->next; @@ -1756,10 +1778,10 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, * no invocations of fuse_writepage_end() while we're in * fuse_set_nowrite..fuse_release_nowrite section. */ - fuse_send_writepage(fc, next, inarg->offset + inarg->size); + fuse_send_writepage(fm, next, inarg->offset + inarg->size); } fi->writectr--; - fuse_writepage_finish(fc, wpa); + fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); fuse_writepage_free(wpa); } @@ -2317,6 +2339,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { struct fuse_file *ff = file->private_data; + /* DAX mmap is superior to direct_io mmap */ + if (FUSE_IS_DAX(file_inode(file))) + return fuse_dax_mmap(file, vma); + if (ff->open_flags & FOPEN_DIRECT_IO) { /* Can't provide the coherency needed for MAP_SHARED */ if (vma->vm_flags & VM_MAYSHARE) @@ -2395,7 +2421,7 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file, static int fuse_getlk(struct file *file, struct file_lock *fl) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_lk_in inarg; struct fuse_lk_out outarg; @@ -2405,9 +2431,9 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) - err = convert_fuse_file_lock(fc, &outarg.lk, fl); + err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl); return err; } @@ -2415,12 +2441,12 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_lk_in inarg; int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL; - pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns); + pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns); int err; if (fl->fl_lmops && fl->fl_lmops->lm_grant) { @@ -2433,7 +2459,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) return 0; fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); /* locking is restartable */ if (err == -EINTR) @@ -2487,13 +2513,13 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) static sector_t fuse_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_bmap_in inarg; struct fuse_bmap_out outarg; int err; - if (!inode->i_sb->s_bdev || fc->no_bmap) + if (!inode->i_sb->s_bdev || fm->fc->no_bmap) return 0; memset(&inarg, 0, sizeof(inarg)); @@ -2507,9 +2533,9 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) - fc->no_bmap = 1; + fm->fc->no_bmap = 1; return err ? 0 : outarg.block; } @@ -2517,7 +2543,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; FUSE_ARGS(args); struct fuse_lseek_in inarg = { @@ -2528,7 +2554,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) struct fuse_lseek_out outarg; int err; - if (fc->no_lseek) + if (fm->fc->no_lseek) goto fallback; args.opcode = FUSE_LSEEK; @@ -2539,10 +2565,10 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err) { if (err == -ENOSYS) { - fc->no_lseek = 1; + fm->fc->no_lseek = 1; goto fallback; } return err; @@ -2728,7 +2754,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned int flags) { struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; struct fuse_ioctl_in inarg = { .fh = ff->fh, .cmd = cmd, @@ -2761,12 +2787,12 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs); + ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); if (!ap.pages || !iov_page) goto out; - fuse_page_descs_length_init(ap.descs, 0, fc->max_pages); + fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages); /* * If restricted, initialize IO parameters as encoded in @cmd. @@ -2811,7 +2837,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, /* make sure there are enough buffer pages and init request with them */ err = -ENOMEM; - if (max_pages > fc->max_pages) + if (max_pages > fm->fc->max_pages) goto out; while (ap.num_pages < max_pages) { ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); @@ -2848,7 +2874,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, ap.args.out_pages = true; ap.args.out_argvar = true; - transferred = fuse_simple_request(fc, &ap.args); + transferred = fuse_simple_request(fm, &ap.args); err = transferred; if (transferred < 0) goto out; @@ -2876,7 +2902,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, goto out; vaddr = kmap_atomic(ap.pages[0]); - err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, + err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr, transferred, in_iovs + out_iovs, (flags & FUSE_IOCTL_COMPAT) != 0); kunmap_atomic(vaddr); @@ -2886,11 +2912,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iov = iov_page; out_iov = in_iov + in_iovs; - err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs); + err = fuse_verify_ioctl_iov(fm->fc, in_iov, in_iovs); if (err) goto out; - err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs); + err = fuse_verify_ioctl_iov(fm->fc, out_iov, out_iovs); if (err) goto out; @@ -3000,13 +3026,13 @@ static void fuse_register_polled_file(struct fuse_conn *fc, __poll_t fuse_file_poll(struct file *file, poll_table *wait) { struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; struct fuse_poll_out outarg; FUSE_ARGS(args); int err; - if (fc->no_poll) + if (fm->fc->no_poll) return DEFAULT_POLLMASK; poll_wait(file, &ff->poll_wait, wait); @@ -3018,7 +3044,7 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait) */ if (waitqueue_active(&ff->poll_wait)) { inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; - fuse_register_polled_file(fc, ff); + fuse_register_polled_file(fm->fc, ff); } args.opcode = FUSE_POLL; @@ -3029,12 +3055,12 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) return demangle_poll(outarg.revents); if (err == -ENOSYS) { - fc->no_poll = 1; + fm->fc->no_poll = 1; return DEFAULT_POLLMASK; } return EPOLLERR; @@ -3120,13 +3146,13 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * By default, we want to optimize all I/Os with async request * submission to the client filesystem if supported. */ - io->async = ff->fc->async_dio; + io->async = ff->fm->fc->async_dio; io->iocb = iocb; io->blocking = is_sync_kiocb(iocb); /* optimization for short read */ if (io->async && !io->write && offset + count > i_size) { - iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset)); + iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset)); shortened = count - iov_iter_count(iter); count -= shortened; } @@ -3196,7 +3222,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, struct fuse_file *ff = file->private_data; struct inode *inode = file_inode(file); struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; FUSE_ARGS(args); struct fuse_fallocate_in inarg = { .fh = ff->fh, @@ -3208,14 +3234,23 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || (mode & FALLOC_FL_PUNCH_HOLE); + bool block_faults = FUSE_IS_DAX(inode) && lock_inode; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; - if (fc->no_fallocate) + if (fm->fc->no_fallocate) return -EOPNOTSUPP; if (lock_inode) { inode_lock(inode); + if (block_faults) { + down_write(&fi->i_mmap_sem); + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) + goto out; + } + if (mode & FALLOC_FL_PUNCH_HOLE) { loff_t endbyte = offset + length - 1; @@ -3240,9 +3275,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_fallocate = 1; + fm->fc->no_fallocate = 1; err = -EOPNOTSUPP; } if (err) @@ -3252,7 +3287,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE)) { bool changed = fuse_write_update_size(inode, offset + length); - if (changed && fc->writeback_cache) + if (changed && fm->fc->writeback_cache) file_update_time(file); } @@ -3265,6 +3300,9 @@ out: if (!(mode & FALLOC_FL_KEEP_SIZE)) clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + if (block_faults) + up_write(&fi->i_mmap_sem); + if (lock_inode) inode_unlock(inode); @@ -3280,7 +3318,8 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); struct fuse_inode *fi_out = get_fuse_inode(inode_out); - struct fuse_conn *fc = ff_in->fc; + struct fuse_mount *fm = ff_in->fm; + struct fuse_conn *fc = fm->fc; FUSE_ARGS(args); struct fuse_copy_file_range_in inarg = { .fh_in = ff_in->fh, @@ -3349,7 +3388,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fc->no_copy_file_range = 1; err = -EOPNOTSUPP; @@ -3404,6 +3443,7 @@ static const struct file_operations fuse_file_operations = { .release = fuse_release, .fsync = fuse_fsync, .lock = fuse_file_lock, + .get_unmapped_area = thp_get_unmapped_area, .flock = fuse_file_flock, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, @@ -3439,4 +3479,7 @@ void fuse_init_file_inode(struct inode *inode) fi->writectr = 0; init_waitqueue_head(&fi->page_waitq); fi->writepages = RB_ROOT; + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_inode_init(inode); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 740a8a7d7ae6..d51598017d13 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -148,6 +148,20 @@ struct fuse_inode { /** Lock to protect write related fields */ spinlock_t lock; + + /** + * Can't take inode lock in fault path (leads to circular dependency). + * Introduce another semaphore which can be taken in fault path and + * then other filesystem paths can take this to block faults. + */ + struct rw_semaphore i_mmap_sem; + +#ifdef CONFIG_FUSE_DAX + /* + * Dax specific inode data + */ + struct fuse_inode_dax *dax; +#endif }; /** FUSE inode state bits */ @@ -161,12 +175,13 @@ enum { }; struct fuse_conn; +struct fuse_mount; struct fuse_release_args; /** FUSE specific file data */ struct fuse_file { /** Fuse connection for this file */ - struct fuse_conn *fc; + struct fuse_mount *fm; /* Argument space reserved for release */ struct fuse_release_args *release_args; @@ -252,7 +267,7 @@ struct fuse_args { bool may_block:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; - void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error); + void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); }; struct fuse_args_pages { @@ -360,6 +375,9 @@ struct fuse_req { /** virtio-fs's physically contiguous buffer for in and out args */ void *argbuf; #endif + + /** fuse_mount this request belongs to */ + struct fuse_mount *fm; }; struct fuse_iqueue; @@ -482,11 +500,15 @@ struct fuse_fs_context { bool destroy:1; bool no_control:1; bool no_force_umount:1; - bool no_mount_options:1; + bool legacy_opts_show:1; + bool dax:1; unsigned int max_read; unsigned int blksize; const char *subtype; + /* DAX device, may be NULL */ + struct dax_device *dax_dev; + /* fuse_dev pointer to fill in, should contain NULL on entry */ void **fudptr; }; @@ -494,9 +516,9 @@ struct fuse_fs_context { /** * A Fuse connection. * - * This structure is created, when the filesystem is mounted, and is - * destroyed, when the client device is closed and the filesystem is - * unmounted. + * This structure is created, when the root filesystem is mounted, and + * is destroyed, when the client device is closed and the last + * fuse_mount is destroyed. */ struct fuse_conn { /** Lock protecting accessess to members of this structure */ @@ -610,6 +632,9 @@ struct fuse_conn { /** cache READLINK responses in page cache */ unsigned cache_symlinks:1; + /* show legacy mount options */ + unsigned int legacy_opts_show:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -717,8 +742,8 @@ struct fuse_conn { /** Do not allow MNT_FORCE umount */ unsigned int no_force_umount:1; - /* Do not show mount options */ - unsigned int no_mount_options:1; + /* Auto-mount submounts announced by the server */ + unsigned int auto_submounts:1; /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -726,10 +751,10 @@ struct fuse_conn { /** Negotiated minor version */ unsigned minor; - /** Entry on the fuse_conn_list */ + /** Entry on the fuse_mount_list */ struct list_head entry; - /** Device ID from super block */ + /** Device ID from the root super block */ dev_t dev; /** Dentries in the control filesystem */ @@ -747,24 +772,70 @@ struct fuse_conn { /** Called on final put */ void (*release)(struct fuse_conn *); - /** Super block for this connection. */ - struct super_block *sb; - - /** Read/write semaphore to hold when accessing sb. */ + /** + * Read/write semaphore to hold when accessing the sb of any + * fuse_mount belonging to this connection + */ struct rw_semaphore killsb; /** List of device instances belonging to this connection */ struct list_head devices; + +#ifdef CONFIG_FUSE_DAX + /* Dax specific conn data, non-NULL if DAX is enabled */ + struct fuse_conn_dax *dax; +#endif + + /** List of filesystems using this connection */ + struct list_head mounts; }; -static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) +/* + * Represents a mounted filesystem, potentially a submount. + * + * This object allows sharing a fuse_conn between separate mounts to + * allow submounts with dedicated superblocks and thus separate device + * IDs. + */ +struct fuse_mount { + /* Underlying (potentially shared) connection to the FUSE server */ + struct fuse_conn *fc; + + /* Refcount */ + refcount_t count; + + /* + * Super block for this connection (fc->killsb must be held when + * accessing this). + */ + struct super_block *sb; + + /* Entry on fc->mounts */ + struct list_head fc_entry; +}; + +static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) { return sb->s_fs_info; } +static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + + return fm ? fm->fc : NULL; +} + +static inline struct fuse_mount *get_fuse_mount(struct inode *inode) +{ + return get_fuse_mount_super(inode->i_sb); +} + static inline struct fuse_conn *get_fuse_conn(struct inode *inode) { - return get_fuse_conn_super(inode->i_sb); + struct fuse_mount *fm = get_fuse_mount(inode); + + return fm ? fm->fc : NULL; } static inline struct fuse_inode *get_fuse_inode(struct inode *inode) @@ -794,11 +865,6 @@ extern const struct dentry_operations fuse_dentry_operations; extern const struct dentry_operations fuse_root_dentry_operations; /** - * Inode to nodeid comparison. - */ -int fuse_inode_eq(struct inode *inode, void *_nodeidp); - -/** * Get a filled in inode */ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, @@ -848,7 +914,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, */ int fuse_open_common(struct inode *inode, struct file *file, bool isdir); -struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm); void fuse_file_free(struct fuse_file *ff); void fuse_finish_open(struct inode *inode, struct file *file); @@ -916,14 +982,14 @@ void __exit fuse_ctl_cleanup(void); /** * Simple request sending that does request allocation and freeing */ -ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); -int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, +ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args); +int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags); /** * End a finished request */ -void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req); +void fuse_request_end(struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); @@ -949,7 +1015,8 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); /** * Initialize fuse_conn */ -void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, +void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, + struct user_namespace *user_ns, const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv); /** @@ -957,11 +1024,21 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, */ void fuse_conn_put(struct fuse_conn *fc); +/** + * Acquire reference to fuse_mount + */ +struct fuse_mount *fuse_mount_get(struct fuse_mount *fm); + +/** + * Release reference to fuse_mount + */ +void fuse_mount_put(struct fuse_mount *fm); + struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); struct fuse_dev *fuse_dev_alloc(void); void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); -void fuse_send_init(struct fuse_conn *fc); +void fuse_send_init(struct fuse_mount *fm); /** * Fill in superblock and initialize fuse connection @@ -970,12 +1047,26 @@ void fuse_send_init(struct fuse_conn *fc); */ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx); -/** - * Disassociate fuse connection from superblock and kill the superblock +/* + * Fill in superblock for submounts + * @sb: partially-initialized superblock to fill in + * @parent_fi: The fuse_inode of the parent filesystem where this submount is + * mounted + */ +int fuse_fill_super_submount(struct super_block *sb, + struct fuse_inode *parent_fi); + +/* + * Remove the mount from the connection * - * Calls kill_anon_super(), do not use with bdev mounts. + * Returns whether this was the last mount */ -void fuse_kill_sb_anon(struct super_block *sb); +bool fuse_mount_remove(struct fuse_mount *fm); + +/* + * Shut down the connection (possibly sending DESTROY request). + */ +void fuse_conn_destroy(struct fuse_mount *fm); /** * Add connection to control filesystem @@ -1011,9 +1102,19 @@ void fuse_set_nowrite(struct inode *inode); void fuse_release_nowrite(struct inode *inode); /** + * Scan all fuse_mounts belonging to fc to find the first where + * ilookup5() returns a result. Return that result and the + * respective fuse_mount in *fm (unless fm is NULL). + * + * The caller must hold fc->killsb. + */ +struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, + struct fuse_mount **fm); + +/** * File-system tells the kernel to invalidate cache for the given node id. */ -int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, +int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, loff_t offset, loff_t len); /** @@ -1026,10 +1127,10 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, * - is a file or oan empty directory * then the dentry is unhashed (d_delete()). */ -int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, +int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, u64 child_nodeid, struct qstr *name); -int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, +int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir); /** @@ -1093,4 +1194,20 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args); u64 fuse_get_unique(struct fuse_iqueue *fiq); void fuse_free_conn(struct fuse_conn *fc); +/* dax.c */ + +#define FUSE_IS_DAX(inode) (IS_ENABLED(CONFIG_FUSE_DAX) && IS_DAX(inode)) + +ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); +ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); +int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma); +int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end); +int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev); +void fuse_dax_conn_free(struct fuse_conn *fc); +bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi); +void fuse_dax_inode_init(struct inode *inode); +void fuse_dax_inode_cleanup(struct inode *inode); +bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment); +void fuse_dax_cancel_work(struct fuse_conn *fc); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 581329203d68..1a47afc95f80 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -85,14 +85,22 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->orig_ino = 0; fi->state = 0; mutex_init(&fi->mutex); + init_rwsem(&fi->i_mmap_sem); spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); - if (!fi->forget) { - kmem_cache_free(fuse_inode_cachep, fi); - return NULL; - } + if (!fi->forget) + goto out_free; + + if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi)) + goto out_free_forget; return &fi->inode; + +out_free_forget: + kfree(fi->forget); +out_free: + kmem_cache_free(fuse_inode_cachep, fi); + return NULL; } static void fuse_free_inode(struct inode *inode) @@ -101,6 +109,9 @@ static void fuse_free_inode(struct inode *inode) mutex_destroy(&fi->mutex); kfree(fi->forget); +#ifdef CONFIG_FUSE_DAX + kfree(fi->dax); +#endif kmem_cache_free(fuse_inode_cachep, fi); } @@ -112,8 +123,14 @@ static void fuse_evict_inode(struct inode *inode) clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); - fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); - fi->forget = NULL; + + if (FUSE_IS_DAX(inode)) + fuse_dax_inode_cleanup(inode); + if (fi->nlookup) { + fuse_queue_forget(fc, fi->forget, fi->nodeid, + fi->nlookup); + fi->forget = NULL; + } } if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) { WARN_ON(!list_empty(&fi->write_files)); @@ -268,7 +285,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) BUG(); } -int fuse_inode_eq(struct inode *inode, void *_nodeidp) +static int fuse_inode_eq(struct inode *inode, void *_nodeidp) { u64 nodeid = *(u64 *) _nodeidp; if (get_node_id(inode) == nodeid) @@ -292,7 +309,26 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, struct fuse_inode *fi; struct fuse_conn *fc = get_fuse_conn_super(sb); - retry: + /* + * Auto mount points get their node id from the submount root, which is + * not a unique identifier within this filesystem. + * + * To avoid conflicts, do not place submount points into the inode hash + * table. + */ + if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) && + S_ISDIR(attr->mode)) { + inode = new_inode(sb); + if (!inode) + return NULL; + + fuse_init_inode(inode, attr); + get_fuse_inode(inode)->nodeid = nodeid; + inode->i_flags |= S_AUTOMOUNT; + goto done; + } + +retry: inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid); if (!inode) return NULL; @@ -310,7 +346,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, iput(inode); goto retry; } - +done: fi = get_fuse_inode(inode); spin_lock(&fi->lock); fi->nlookup++; @@ -320,16 +356,37 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, return inode; } -int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, +struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, + struct fuse_mount **fm) +{ + struct fuse_mount *fm_iter; + struct inode *inode; + + WARN_ON(!rwsem_is_locked(&fc->killsb)); + list_for_each_entry(fm_iter, &fc->mounts, fc_entry) { + if (!fm_iter->sb) + continue; + + inode = ilookup5(fm_iter->sb, nodeid, fuse_inode_eq, &nodeid); + if (inode) { + if (fm) + *fm = fm_iter; + return inode; + } + } + + return NULL; +} + +int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, loff_t offset, loff_t len) { - struct fuse_conn *fc = get_fuse_conn_super(sb); struct fuse_inode *fi; struct inode *inode; pgoff_t pg_start; pgoff_t pg_end; - inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid); + inode = fuse_ilookup(fc, nodeid, NULL); if (!inode) return -ENOENT; @@ -379,28 +436,23 @@ static void fuse_umount_begin(struct super_block *sb) fuse_abort_conn(fc); } -static void fuse_send_destroy(struct fuse_conn *fc) +static void fuse_send_destroy(struct fuse_mount *fm) { - if (fc->conn_init) { + if (fm->fc->conn_init) { FUSE_ARGS(args); args.opcode = FUSE_DESTROY; args.force = true; args.nocreds = true; - fuse_simple_request(fc, &args); + fuse_simple_request(fm, &args); } } static void fuse_put_super(struct super_block *sb) { - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); - mutex_lock(&fuse_mutex); - list_del(&fc->entry); - fuse_ctl_remove_conn(fc); - mutex_unlock(&fuse_mutex); - - fuse_conn_put(fc); + fuse_mount_put(fm); } static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) @@ -420,12 +472,12 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); struct fuse_statfs_out outarg; int err; - if (!fuse_allow_current_process(fc)) { + if (!fuse_allow_current_process(fm->fc)) { buf->f_type = FUSE_SUPER_MAGIC; return 0; } @@ -437,7 +489,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) convert_fuse_statfs(buf, &outarg.st); return err; @@ -573,19 +625,25 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) struct super_block *sb = root->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); - if (fc->no_mount_options) - return 0; + if (fc->legacy_opts_show) { + seq_printf(m, ",user_id=%u", + from_kuid_munged(fc->user_ns, fc->user_id)); + seq_printf(m, ",group_id=%u", + from_kgid_munged(fc->user_ns, fc->group_id)); + if (fc->default_permissions) + seq_puts(m, ",default_permissions"); + if (fc->allow_other) + seq_puts(m, ",allow_other"); + if (fc->max_read != ~0) + seq_printf(m, ",max_read=%u", fc->max_read); + if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) + seq_printf(m, ",blksize=%lu", sb->s_blocksize); + } +#ifdef CONFIG_FUSE_DAX + if (fc->dax) + seq_puts(m, ",dax"); +#endif - seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id)); - seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id)); - if (fc->default_permissions) - seq_puts(m, ",default_permissions"); - if (fc->allow_other) - seq_puts(m, ",allow_other"); - if (fc->max_read != ~0) - seq_printf(m, ",max_read=%u", fc->max_read); - if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) - seq_printf(m, ",blksize=%lu", sb->s_blocksize); return 0; } @@ -615,7 +673,8 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq) fpq->connected = 1; } -void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, +void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, + struct user_namespace *user_ns, const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv) { memset(fc, 0, sizeof(*fc)); @@ -642,6 +701,11 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; + + INIT_LIST_HEAD(&fc->mounts); + list_add(&fm->fc_entry, &fc->mounts); + fm->fc = fc; + refcount_set(&fm->count, 1); } EXPORT_SYMBOL_GPL(fuse_conn_init); @@ -650,6 +714,8 @@ void fuse_conn_put(struct fuse_conn *fc) if (refcount_dec_and_test(&fc->count)) { struct fuse_iqueue *fiq = &fc->iq; + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_conn_free(fc); if (fiq->ops->release) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); @@ -666,6 +732,23 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) } EXPORT_SYMBOL_GPL(fuse_conn_get); +void fuse_mount_put(struct fuse_mount *fm) +{ + if (refcount_dec_and_test(&fm->count)) { + if (fm->fc) + fuse_conn_put(fm->fc); + kfree(fm); + } +} +EXPORT_SYMBOL_GPL(fuse_mount_put); + +struct fuse_mount *fuse_mount_get(struct fuse_mount *fm) +{ + refcount_inc(&fm->count); + return fm; +} +EXPORT_SYMBOL_GPL(fuse_mount_get); + static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) { struct fuse_attr attr; @@ -895,14 +978,16 @@ struct fuse_init_args { struct fuse_init_out out; }; -static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, +static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, int error) { + struct fuse_conn *fc = fm->fc; struct fuse_init_args *ia = container_of(args, typeof(*ia), args); struct fuse_init_out *arg = &ia->out; + bool ok = true; if (error || arg->major != FUSE_KERNEL_VERSION) - fc->conn_error = 1; + ok = false; else { unsigned long ra_pages; @@ -950,11 +1035,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, if (arg->flags & FUSE_HANDLE_KILLPRIV) fc->handle_killpriv = 1; if (arg->time_gran && arg->time_gran <= 1000000000) - fc->sb->s_time_gran = arg->time_gran; + fm->sb->s_time_gran = arg->time_gran; if ((arg->flags & FUSE_POSIX_ACL)) { fc->default_permissions = 1; fc->posix_acl = 1; - fc->sb->s_xattr = fuse_acl_xattr_handlers; + fm->sb->s_xattr = fuse_acl_xattr_handlers; } if (arg->flags & FUSE_CACHE_SYMLINKS) fc->cache_symlinks = 1; @@ -965,14 +1050,19 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, min_t(unsigned int, FUSE_MAX_MAX_PAGES, max_t(unsigned int, arg->max_pages, 1)); } + if (IS_ENABLED(CONFIG_FUSE_DAX) && + arg->flags & FUSE_MAP_ALIGNMENT && + !fuse_dax_check_alignment(fc, arg->map_alignment)) { + ok = false; + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; fc->no_flock = 1; } - fc->sb->s_bdi->ra_pages = - min(fc->sb->s_bdi->ra_pages, ra_pages); + fm->sb->s_bdi->ra_pages = + min(fm->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; fc->max_write = max_t(unsigned, 4096, fc->max_write); @@ -980,11 +1070,16 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, } kfree(ia); + if (!ok) { + fc->conn_init = 0; + fc->conn_error = 1; + } + fuse_set_initialized(fc); wake_up_all(&fc->blocked_waitq); } -void fuse_send_init(struct fuse_conn *fc) +void fuse_send_init(struct fuse_mount *fm) { struct fuse_init_args *ia; @@ -992,7 +1087,7 @@ void fuse_send_init(struct fuse_conn *fc) ia->in.major = FUSE_KERNEL_VERSION; ia->in.minor = FUSE_KERNEL_MINOR_VERSION; - ia->in.max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE; + ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE; ia->in.flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | @@ -1003,6 +1098,13 @@ void fuse_send_init(struct fuse_conn *fc) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA; +#ifdef CONFIG_FUSE_DAX + if (fm->fc->dax) + ia->in.flags |= FUSE_MAP_ALIGNMENT; +#endif + if (fm->fc->auto_submounts) + ia->in.flags |= FUSE_SUBMOUNTS; + ia->args.opcode = FUSE_INIT; ia->args.in_numargs = 1; ia->args.in_args[0].size = sizeof(ia->in); @@ -1018,8 +1120,8 @@ void fuse_send_init(struct fuse_conn *fc) ia->args.nocreds = true; ia->args.end = process_init_reply; - if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0) - process_init_reply(fc, &ia->args, -ENOTCONN); + if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) + process_init_reply(fm, &ia->args, -ENOTCONN); } EXPORT_SYMBOL_GPL(fuse_send_init); @@ -1130,10 +1232,92 @@ void fuse_dev_free(struct fuse_dev *fud) } EXPORT_SYMBOL_GPL(fuse_dev_free); +static void fuse_fill_attr_from_inode(struct fuse_attr *attr, + const struct fuse_inode *fi) +{ + *attr = (struct fuse_attr){ + .ino = fi->inode.i_ino, + .size = fi->inode.i_size, + .blocks = fi->inode.i_blocks, + .atime = fi->inode.i_atime.tv_sec, + .mtime = fi->inode.i_mtime.tv_sec, + .ctime = fi->inode.i_ctime.tv_sec, + .atimensec = fi->inode.i_atime.tv_nsec, + .mtimensec = fi->inode.i_mtime.tv_nsec, + .ctimensec = fi->inode.i_ctime.tv_nsec, + .mode = fi->inode.i_mode, + .nlink = fi->inode.i_nlink, + .uid = fi->inode.i_uid.val, + .gid = fi->inode.i_gid.val, + .rdev = fi->inode.i_rdev, + .blksize = 1u << fi->inode.i_blkbits, + }; +} + +static void fuse_sb_defaults(struct super_block *sb) +{ + sb->s_magic = FUSE_SUPER_MAGIC; + sb->s_op = &fuse_super_operations; + sb->s_xattr = fuse_xattr_handlers; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_time_gran = 1; + sb->s_export_op = &fuse_export_operations; + sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; + if (sb->s_user_ns != &init_user_ns) + sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; + sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); + + /* + * If we are not in the initial user namespace posix + * acls must be translated. + */ + if (sb->s_user_ns != &init_user_ns) + sb->s_xattr = fuse_no_acl_xattr_handlers; +} + +int fuse_fill_super_submount(struct super_block *sb, + struct fuse_inode *parent_fi) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct super_block *parent_sb = parent_fi->inode.i_sb; + struct fuse_attr root_attr; + struct inode *root; + + fuse_sb_defaults(sb); + fm->sb = sb; + + WARN_ON(sb->s_bdi != &noop_backing_dev_info); + sb->s_bdi = bdi_get(parent_sb->s_bdi); + + sb->s_xattr = parent_sb->s_xattr; + sb->s_time_gran = parent_sb->s_time_gran; + sb->s_blocksize = parent_sb->s_blocksize; + sb->s_blocksize_bits = parent_sb->s_blocksize_bits; + sb->s_subtype = kstrdup(parent_sb->s_subtype, GFP_KERNEL); + if (parent_sb->s_subtype && !sb->s_subtype) + return -ENOMEM; + + fuse_fill_attr_from_inode(&root_attr, parent_fi); + root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0); + /* + * This inode is just a duplicate, so it is not looked up and + * its nlookup should not be incremented. fuse_iget() does + * that, though, so undo it here. + */ + get_fuse_inode(root)->nlookup--; + sb->s_d_op = &fuse_dentry_operations; + sb->s_root = d_make_root(root); + if (!sb->s_root) + return -ENOMEM; + + return 0; +} + int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) { struct fuse_dev *fud = NULL; - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct fuse_conn *fc = fm->fc; struct inode *root; struct dentry *root_dentry; int err; @@ -1142,7 +1326,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) if (sb->s_flags & SB_MANDLOCK) goto err; - sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); + fuse_sb_defaults(sb); if (ctx->is_bdev) { #ifdef CONFIG_BLOCK @@ -1157,32 +1341,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) sb->s_subtype = ctx->subtype; ctx->subtype = NULL; - sb->s_magic = FUSE_SUPER_MAGIC; - sb->s_op = &fuse_super_operations; - sb->s_xattr = fuse_xattr_handlers; - sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_time_gran = 1; - sb->s_export_op = &fuse_export_operations; - sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; - if (sb->s_user_ns != &init_user_ns) - sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; - - /* - * If we are not in the initial user namespace posix - * acls must be translated. - */ - if (sb->s_user_ns != &init_user_ns) - sb->s_xattr = fuse_no_acl_xattr_handlers; + if (IS_ENABLED(CONFIG_FUSE_DAX)) { + err = fuse_dax_conn_alloc(fc, ctx->dax_dev); + if (err) + goto err; + } if (ctx->fudptr) { err = -ENOMEM; fud = fuse_dev_alloc_install(fc); if (!fud) - goto err; + goto err_free_dax; } fc->dev = sb->s_dev; - fc->sb = sb; + fm->sb = sb; err = fuse_bdi_init(fc, sb); if (err) goto err_dev_free; @@ -1196,11 +1369,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) fc->allow_other = ctx->allow_other; fc->user_id = ctx->user_id; fc->group_id = ctx->group_id; - fc->max_read = max_t(unsigned, 4096, ctx->max_read); + fc->legacy_opts_show = ctx->legacy_opts_show; + fc->max_read = max_t(unsigned int, 4096, ctx->max_read); fc->destroy = ctx->destroy; fc->no_control = ctx->no_control; fc->no_force_umount = ctx->no_force_umount; - fc->no_mount_options = ctx->no_mount_options; err = -ENOMEM; root = fuse_get_root_inode(sb, ctx->rootmode); @@ -1233,6 +1406,9 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) err_dev_free: if (fud) fuse_dev_free(fud); + err_free_dax: + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_conn_free(fc); err: return err; } @@ -1244,6 +1420,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) struct file *file; int err; struct fuse_conn *fc; + struct fuse_mount *fm; err = -EINVAL; file = fget(ctx->fd); @@ -1264,9 +1441,16 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) if (!fc) goto err_fput; - fuse_conn_init(fc, sb->s_user_ns, &fuse_dev_fiq_ops, NULL); + fm = kzalloc(sizeof(*fm), GFP_KERNEL); + if (!fm) { + kfree(fc); + goto err_fput; + } + + fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL); fc->release = fuse_free_conn; - sb->s_fs_info = fc; + + sb->s_fs_info = fm; err = fuse_fill_super_common(sb, ctx); if (err) @@ -1277,11 +1461,11 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) * CPUs after this */ fput(file); - fuse_send_init(get_fuse_conn_super(sb)); + fuse_send_init(get_fuse_mount_super(sb)); return 0; err_put_conn: - fuse_conn_put(fc); + fuse_mount_put(fm); sb->s_fs_info = NULL; err_fput: fput(file); @@ -1325,6 +1509,7 @@ static int fuse_init_fs_context(struct fs_context *fc) ctx->max_read = ~0; ctx->blksize = FUSE_DEFAULT_BLKSIZE; + ctx->legacy_opts_show = true; #ifdef CONFIG_BLOCK if (fc->fs_type == &fuseblk_fs_type) { @@ -1338,29 +1523,52 @@ static int fuse_init_fs_context(struct fs_context *fc) return 0; } -static void fuse_sb_destroy(struct super_block *sb) +bool fuse_mount_remove(struct fuse_mount *fm) { - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_conn *fc = fm->fc; + bool last = false; - if (fc) { - if (fc->destroy) - fuse_send_destroy(fc); + down_write(&fc->killsb); + list_del_init(&fm->fc_entry); + if (list_empty(&fc->mounts)) + last = true; + up_write(&fc->killsb); - fuse_abort_conn(fc); - fuse_wait_aborted(fc); + return last; +} +EXPORT_SYMBOL_GPL(fuse_mount_remove); - down_write(&fc->killsb); - fc->sb = NULL; - up_write(&fc->killsb); +void fuse_conn_destroy(struct fuse_mount *fm) +{ + struct fuse_conn *fc = fm->fc; + + if (fc->destroy) + fuse_send_destroy(fm); + + fuse_abort_conn(fc); + fuse_wait_aborted(fc); + + if (!list_empty(&fc->entry)) { + mutex_lock(&fuse_mutex); + list_del(&fc->entry); + fuse_ctl_remove_conn(fc); + mutex_unlock(&fuse_mutex); } } +EXPORT_SYMBOL_GPL(fuse_conn_destroy); -void fuse_kill_sb_anon(struct super_block *sb) +static void fuse_kill_sb_anon(struct super_block *sb) { - fuse_sb_destroy(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); + bool last; + + if (fm) { + last = fuse_mount_remove(fm); + if (last) + fuse_conn_destroy(fm); + } kill_anon_super(sb); } -EXPORT_SYMBOL_GPL(fuse_kill_sb_anon); static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, @@ -1375,7 +1583,14 @@ MODULE_ALIAS_FS("fuse"); #ifdef CONFIG_BLOCK static void fuse_kill_sb_blk(struct super_block *sb) { - fuse_sb_destroy(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); + bool last; + + if (fm) { + last = fuse_mount_remove(fm); + if (last) + fuse_conn_destroy(fm); + } kill_block_super(sb); } diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 90e3f01bd796..3b5e91045871 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -252,7 +252,7 @@ retry: static void fuse_force_forget(struct file *file, u64 nodeid) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_forget_in inarg; FUSE_ARGS(args); @@ -266,7 +266,7 @@ static void fuse_force_forget(struct file *file, u64 nodeid) args.force = true; args.noreply = true; - fuse_simple_request(fc, &args); + fuse_simple_request(fm, &args); /* ignore errors */ } @@ -320,7 +320,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) ssize_t res; struct page *page; struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_io_args ia = {}; struct fuse_args_pages *ap = &ia.ap; struct fuse_page_desc desc = { .length = PAGE_SIZE }; @@ -337,7 +337,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) ap->pages = &page; ap->descs = &desc; if (plus) { - attr_version = fuse_get_attr_version(fc); + attr_version = fuse_get_attr_version(fm->fc); fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, FUSE_READDIRPLUS); } else { @@ -345,7 +345,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) FUSE_READDIR); } locked = fuse_lock_inode(inode); - res = fuse_simple_request(fc, &ap->args); + res = fuse_simple_request(fm, &ap->args); fuse_unlock_inode(inode, locked); if (res >= 0) { if (!res) { diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 104f35de5270..21a9e534417c 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -5,12 +5,17 @@ */ #include <linux/fs.h> +#include <linux/dax.h> +#include <linux/pci.h> +#include <linux/pfn_t.h> #include <linux/module.h> #include <linux/virtio.h> #include <linux/virtio_fs.h> #include <linux/delay.h> #include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/highmem.h> +#include <linux/uio.h> #include "fuse_i.h" /* List of virtio-fs device instances and a lock for the list. Also provides @@ -24,6 +29,8 @@ enum { VQ_REQUEST }; +#define VQ_NAME_LEN 24 + /* Per-virtqueue state */ struct virtio_fs_vq { spinlock_t lock; @@ -36,7 +43,7 @@ struct virtio_fs_vq { bool connected; long in_flight; struct completion in_flight_zero; /* No inflight requests */ - char name[24]; + char name[VQ_NAME_LEN]; } ____cacheline_aligned_in_smp; /* A virtio-fs device instance */ @@ -47,6 +54,12 @@ struct virtio_fs { struct virtio_fs_vq *vqs; unsigned int nvqs; /* number of virtqueues */ unsigned int num_request_queues; /* number of request queues */ + struct dax_device *dax_dev; + + /* DAX memory window where file contents are mapped */ + void *window_kaddr; + phys_addr_t window_phys_addr; + size_t window_len; }; struct virtio_fs_forget_req { @@ -69,6 +82,44 @@ struct virtio_fs_req_work { static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, struct fuse_req *req, bool in_flight); +enum { + OPT_DAX, +}; + +static const struct fs_parameter_spec virtio_fs_parameters[] = { + fsparam_flag("dax", OPT_DAX), + {} +}; + +static int virtio_fs_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct fs_parse_result result; + struct fuse_fs_context *ctx = fc->fs_private; + int opt; + + opt = fs_parse(fc, virtio_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case OPT_DAX: + ctx->dax = 1; + break; + default: + return -EINVAL; + } + + return 0; +} + +static void virtio_fs_free_fc(struct fs_context *fc) +{ + struct fuse_fs_context *ctx = fc->fs_private; + + kfree(ctx); +} + static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq) { struct virtio_fs *fs = vq->vdev->priv; @@ -289,7 +340,6 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) struct fuse_req *req; struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, dispatch_work.work); - struct fuse_conn *fc = fsvq->fud->fc; int ret; pr_debug("virtio-fs: worker %s called.\n", __func__); @@ -304,7 +354,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) list_del_init(&req->list); spin_unlock(&fsvq->lock); - fuse_request_end(fc, req); + fuse_request_end(req); } /* Dispatch pending requests */ @@ -335,7 +385,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) spin_unlock(&fsvq->lock); pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret); - fuse_request_end(fc, req); + fuse_request_end(req); } } } @@ -495,7 +545,6 @@ static void virtio_fs_request_complete(struct fuse_req *req, struct virtio_fs_vq *fsvq) { struct fuse_pqueue *fpq = &fsvq->fud->pq; - struct fuse_conn *fc = fsvq->fud->fc; struct fuse_args *args; struct fuse_args_pages *ap; unsigned int len, i, thislen; @@ -528,7 +577,7 @@ static void virtio_fs_request_complete(struct fuse_req *req, clear_bit(FR_SENT, &req->flags); spin_unlock(&fpq->lock); - fuse_request_end(fc, req); + fuse_request_end(req); spin_lock(&fsvq->lock); dec_in_flight_req(fsvq); spin_unlock(&fsvq->lock); @@ -596,6 +645,26 @@ static void virtio_fs_vq_done(struct virtqueue *vq) schedule_work(&fsvq->done_work); } +static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name, + int vq_type) +{ + strncpy(fsvq->name, name, VQ_NAME_LEN); + spin_lock_init(&fsvq->lock); + INIT_LIST_HEAD(&fsvq->queued_reqs); + INIT_LIST_HEAD(&fsvq->end_reqs); + init_completion(&fsvq->in_flight_zero); + + if (vq_type == VQ_REQUEST) { + INIT_WORK(&fsvq->done_work, virtio_fs_requests_done_work); + INIT_DELAYED_WORK(&fsvq->dispatch_work, + virtio_fs_request_dispatch_work); + } else { + INIT_WORK(&fsvq->done_work, virtio_fs_hiprio_done_work); + INIT_DELAYED_WORK(&fsvq->dispatch_work, + virtio_fs_hiprio_dispatch_work); + } +} + /* Initialize virtqueues */ static int virtio_fs_setup_vqs(struct virtio_device *vdev, struct virtio_fs *fs) @@ -611,7 +680,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, if (fs->num_request_queues == 0) return -EINVAL; - fs->nvqs = 1 + fs->num_request_queues; + fs->nvqs = VQ_REQUEST + fs->num_request_queues; fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL); if (!fs->vqs) return -ENOMEM; @@ -625,29 +694,17 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, goto out; } + /* Initialize the hiprio/forget request virtqueue */ callbacks[VQ_HIPRIO] = virtio_fs_vq_done; - snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name), - "hiprio"); + virtio_fs_init_vq(&fs->vqs[VQ_HIPRIO], "hiprio", VQ_HIPRIO); names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name; - INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work); - INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs); - INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].end_reqs); - INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work, - virtio_fs_hiprio_dispatch_work); - init_completion(&fs->vqs[VQ_HIPRIO].in_flight_zero); - spin_lock_init(&fs->vqs[VQ_HIPRIO].lock); /* Initialize the requests virtqueues */ for (i = VQ_REQUEST; i < fs->nvqs; i++) { - spin_lock_init(&fs->vqs[i].lock); - INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work); - INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work, - virtio_fs_request_dispatch_work); - INIT_LIST_HEAD(&fs->vqs[i].queued_reqs); - INIT_LIST_HEAD(&fs->vqs[i].end_reqs); - init_completion(&fs->vqs[i].in_flight_zero); - snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name), - "requests.%u", i - VQ_REQUEST); + char vq_name[VQ_NAME_LEN]; + + snprintf(vq_name, VQ_NAME_LEN, "requests.%u", i - VQ_REQUEST); + virtio_fs_init_vq(&fs->vqs[i], vq_name, VQ_REQUEST); callbacks[i] = virtio_fs_vq_done; names[i] = fs->vqs[i].name; } @@ -676,6 +733,130 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, vdev->config->del_vqs(vdev); } +/* Map a window offset to a page frame number. The window offset will have + * been produced by .iomap_begin(), which maps a file offset to a window + * offset. + */ +static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) +{ + struct virtio_fs *fs = dax_get_private(dax_dev); + phys_addr_t offset = PFN_PHYS(pgoff); + size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff; + + if (kaddr) + *kaddr = fs->window_kaddr + offset; + if (pfn) + *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, + PFN_DEV | PFN_MAP); + return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; +} + +static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i) +{ + return copy_from_iter(addr, bytes, i); +} + +static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i) +{ + return copy_to_iter(addr, bytes, i); +} + +static int virtio_fs_zero_page_range(struct dax_device *dax_dev, + pgoff_t pgoff, size_t nr_pages) +{ + long rc; + void *kaddr; + + rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL); + if (rc < 0) + return rc; + memset(kaddr, 0, nr_pages << PAGE_SHIFT); + dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT); + return 0; +} + +static const struct dax_operations virtio_fs_dax_ops = { + .direct_access = virtio_fs_direct_access, + .copy_from_iter = virtio_fs_copy_from_iter, + .copy_to_iter = virtio_fs_copy_to_iter, + .zero_page_range = virtio_fs_zero_page_range, +}; + +static void virtio_fs_cleanup_dax(void *data) +{ + struct dax_device *dax_dev = data; + + kill_dax(dax_dev); + put_dax(dax_dev); +} + +static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) +{ + struct virtio_shm_region cache_reg; + struct dev_pagemap *pgmap; + bool have_cache; + + if (!IS_ENABLED(CONFIG_FUSE_DAX)) + return 0; + + /* Get cache region */ + have_cache = virtio_get_shm_region(vdev, &cache_reg, + (u8)VIRTIO_FS_SHMCAP_ID_CACHE); + if (!have_cache) { + dev_notice(&vdev->dev, "%s: No cache capability\n", __func__); + return 0; + } + + if (!devm_request_mem_region(&vdev->dev, cache_reg.addr, cache_reg.len, + dev_name(&vdev->dev))) { + dev_warn(&vdev->dev, "could not reserve region addr=0x%llx len=0x%llx\n", + cache_reg.addr, cache_reg.len); + return -EBUSY; + } + + dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", cache_reg.len, + cache_reg.addr); + + pgmap = devm_kzalloc(&vdev->dev, sizeof(*pgmap), GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + + pgmap->type = MEMORY_DEVICE_FS_DAX; + + /* Ideally we would directly use the PCI BAR resource but + * devm_memremap_pages() wants its own copy in pgmap. So + * initialize a struct resource from scratch (only the start + * and end fields will be used). + */ + pgmap->range = (struct range) { + .start = (phys_addr_t) cache_reg.addr, + .end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1, + }; + pgmap->nr_range = 1; + + fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap); + if (IS_ERR(fs->window_kaddr)) + return PTR_ERR(fs->window_kaddr); + + fs->window_phys_addr = (phys_addr_t) cache_reg.addr; + fs->window_len = (phys_addr_t) cache_reg.len; + + dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n", + __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len); + + fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops, 0); + if (IS_ERR(fs->dax_dev)) + return PTR_ERR(fs->dax_dev); + + return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, + fs->dax_dev); +} + static int virtio_fs_probe(struct virtio_device *vdev) { struct virtio_fs *fs; @@ -697,6 +878,10 @@ static int virtio_fs_probe(struct virtio_device *vdev) /* TODO vq affinity */ + ret = virtio_fs_setup_dax(vdev, fs); + if (ret < 0) + goto out_vqs; + /* Bring the device online in case the filesystem is mounted and * requests need to be sent before we return. */ @@ -833,18 +1018,37 @@ __releases(fiq->lock) spin_unlock(&fiq->lock); } +/* Count number of scatter-gather elements required */ +static unsigned int sg_count_fuse_pages(struct fuse_page_desc *page_descs, + unsigned int num_pages, + unsigned int total_len) +{ + unsigned int i; + unsigned int this_len; + + for (i = 0; i < num_pages && total_len; i++) { + this_len = min(page_descs[i].length, total_len); + total_len -= this_len; + } + + return i; +} + /* Return the number of scatter-gather list elements required */ static unsigned int sg_count_fuse_req(struct fuse_req *req) { struct fuse_args *args = req->args; struct fuse_args_pages *ap = container_of(args, typeof(*ap), args); - unsigned int total_sgs = 1 /* fuse_in_header */; + unsigned int size, total_sgs = 1 /* fuse_in_header */; if (args->in_numargs - args->in_pages) total_sgs += 1; - if (args->in_pages) - total_sgs += ap->num_pages; + if (args->in_pages) { + size = args->in_args[args->in_numargs - 1].size; + total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages, + size); + } if (!test_bit(FR_ISREPLY, &req->flags)) return total_sgs; @@ -854,8 +1058,11 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req) if (args->out_numargs - args->out_pages) total_sgs += 1; - if (args->out_pages) - total_sgs += ap->num_pages; + if (args->out_pages) { + size = args->out_args[args->out_numargs - 1].size; + total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages, + size); + } return total_sgs; } @@ -1071,24 +1278,28 @@ static const struct fuse_iqueue_ops virtio_fs_fiq_ops = { .release = virtio_fs_fiq_release, }; -static int virtio_fs_fill_super(struct super_block *sb) +static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx) { - struct fuse_conn *fc = get_fuse_conn_super(sb); + ctx->rootmode = S_IFDIR; + ctx->default_permissions = 1; + ctx->allow_other = 1; + ctx->max_read = UINT_MAX; + ctx->blksize = 512; + ctx->destroy = true; + ctx->no_control = true; + ctx->no_force_umount = true; +} + +static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct fuse_conn *fc = fm->fc; struct virtio_fs *fs = fc->iq.priv; + struct fuse_fs_context *ctx = fsc->fs_private; unsigned int i; int err; - struct fuse_fs_context ctx = { - .rootmode = S_IFDIR, - .default_permissions = 1, - .allow_other = 1, - .max_read = UINT_MAX, - .blksize = 512, - .destroy = true, - .no_control = true, - .no_force_umount = true, - .no_mount_options = true, - }; + virtio_fs_ctx_set_defaults(ctx); mutex_lock(&virtio_fs_mutex); /* After holding mutex, make sure virtiofs device is still there. @@ -1112,8 +1323,10 @@ static int virtio_fs_fill_super(struct super_block *sb) } /* virtiofs allocates and installs its own fuse devices */ - ctx.fudptr = NULL; - err = fuse_fill_super_common(sb, &ctx); + ctx->fudptr = NULL; + if (ctx->dax) + ctx->dax_dev = fs->dax_dev; + err = fuse_fill_super_common(sb, ctx); if (err < 0) goto err_free_fuse_devs; @@ -1125,7 +1338,7 @@ static int virtio_fs_fill_super(struct super_block *sb) /* Previous unmount will stop all queues. Start these again */ virtio_fs_start_all_queues(fs); - fuse_send_init(fc); + fuse_send_init(fm); mutex_unlock(&virtio_fs_mutex); return 0; @@ -1136,18 +1349,17 @@ err: return err; } -static void virtio_kill_sb(struct super_block *sb) +static void virtio_fs_conn_destroy(struct fuse_mount *fm) { - struct fuse_conn *fc = get_fuse_conn_super(sb); - struct virtio_fs *vfs; - struct virtio_fs_vq *fsvq; - - /* If mount failed, we can still be called without any fc */ - if (!fc) - return fuse_kill_sb_anon(sb); + struct fuse_conn *fc = fm->fc; + struct virtio_fs *vfs = fc->iq.priv; + struct virtio_fs_vq *fsvq = &vfs->vqs[VQ_HIPRIO]; - vfs = fc->iq.priv; - fsvq = &vfs->vqs[VQ_HIPRIO]; + /* Stop dax worker. Soon evict_inodes() will be called which + * will free all memory ranges belonging to all inodes. + */ + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_cancel_work(fc); /* Stop forget queue. Soon destroy will be sent */ spin_lock(&fsvq->lock); @@ -1155,9 +1367,9 @@ static void virtio_kill_sb(struct super_block *sb) spin_unlock(&fsvq->lock); virtio_fs_drain_all_queues(vfs); - fuse_kill_sb_anon(sb); + fuse_conn_destroy(fm); - /* fuse_kill_sb_anon() must have sent destroy. Stop all queues + /* fuse_conn_destroy() must have sent destroy. Stop all queues * and drain one more time and free fuse devices. Freeing fuse * devices will drop their reference on fuse_conn and that in * turn will drop its reference on virtio_fs object. @@ -1167,12 +1379,27 @@ static void virtio_kill_sb(struct super_block *sb) virtio_fs_free_devs(vfs); } +static void virtio_kill_sb(struct super_block *sb) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + bool last; + + /* If mount failed, we can still be called without any fc */ + if (fm) { + last = fuse_mount_remove(fm); + if (last) + virtio_fs_conn_destroy(fm); + } + kill_anon_super(sb); +} + static int virtio_fs_test_super(struct super_block *sb, struct fs_context *fsc) { - struct fuse_conn *fc = fsc->s_fs_info; + struct fuse_mount *fsc_fm = fsc->s_fs_info; + struct fuse_mount *sb_fm = get_fuse_mount_super(sb); - return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv; + return fsc_fm->fc->iq.priv == sb_fm->fc->iq.priv; } static int virtio_fs_set_super(struct super_block *sb, @@ -1182,7 +1409,7 @@ static int virtio_fs_set_super(struct super_block *sb, err = get_anon_bdev(&sb->s_dev); if (!err) - fuse_conn_get(fsc->s_fs_info); + fuse_mount_get(fsc->s_fs_info); return err; } @@ -1192,6 +1419,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc) struct virtio_fs *fs; struct super_block *sb; struct fuse_conn *fc; + struct fuse_mount *fm; int err; /* This gets a reference on virtio_fs object. This ptr gets installed @@ -1212,19 +1440,29 @@ static int virtio_fs_get_tree(struct fs_context *fsc) return -ENOMEM; } - fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops, - fs); + fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); + if (!fm) { + mutex_lock(&virtio_fs_mutex); + virtio_fs_put(fs); + mutex_unlock(&virtio_fs_mutex); + kfree(fc); + return -ENOMEM; + } + + fuse_conn_init(fc, fm, get_user_ns(current_user_ns()), + &virtio_fs_fiq_ops, fs); fc->release = fuse_free_conn; fc->delete_stale = true; + fc->auto_submounts = true; - fsc->s_fs_info = fc; + fsc->s_fs_info = fm; sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super); - fuse_conn_put(fc); + fuse_mount_put(fm); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { - err = virtio_fs_fill_super(sb); + err = virtio_fs_fill_super(sb, fsc); if (err) { deactivate_locked_super(sb); return err; @@ -1239,11 +1477,19 @@ static int virtio_fs_get_tree(struct fs_context *fsc) } static const struct fs_context_operations virtio_fs_context_ops = { + .free = virtio_fs_free_fc, + .parse_param = virtio_fs_parse_param, .get_tree = virtio_fs_get_tree, }; static int virtio_fs_init_fs_context(struct fs_context *fsc) { + struct fuse_fs_context *ctx; + + ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + fsc->fs_private = ctx; fsc->ops = &virtio_fs_context_ops; return 0; } diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 20d052e08b3b..371bdcbc7233 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -14,12 +14,12 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_setxattr_in inarg; int err; - if (fc->no_setxattr) + if (fm->fc->no_setxattr) return -EOPNOTSUPP; memset(&inarg, 0, sizeof(inarg)); @@ -34,9 +34,9 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, args.in_args[1].value = name; args.in_args[2].size = size; args.in_args[2].value = value; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_setxattr = 1; + fm->fc->no_setxattr = 1; err = -EOPNOTSUPP; } if (!err) { @@ -49,13 +49,13 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, size_t size) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_getxattr_in inarg; struct fuse_getxattr_out outarg; ssize_t ret; - if (fc->no_getxattr) + if (fm->fc->no_getxattr) return -EOPNOTSUPP; memset(&inarg, 0, sizeof(inarg)); @@ -77,11 +77,11 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; } - ret = fuse_simple_request(fc, &args); + ret = fuse_simple_request(fm, &args); if (!ret && !size) ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX); if (ret == -ENOSYS) { - fc->no_getxattr = 1; + fm->fc->no_getxattr = 1; ret = -EOPNOTSUPP; } return ret; @@ -107,16 +107,16 @@ static int fuse_verify_xattr_list(char *list, size_t size) ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) { struct inode *inode = d_inode(entry); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_getxattr_in inarg; struct fuse_getxattr_out outarg; ssize_t ret; - if (!fuse_allow_current_process(fc)) + if (!fuse_allow_current_process(fm->fc)) return -EACCES; - if (fc->no_listxattr) + if (fm->fc->no_listxattr) return -EOPNOTSUPP; memset(&inarg, 0, sizeof(inarg)); @@ -136,13 +136,13 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; } - ret = fuse_simple_request(fc, &args); + ret = fuse_simple_request(fm, &args); if (!ret && !size) ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX); if (ret > 0 && size) ret = fuse_verify_xattr_list(list, ret); if (ret == -ENOSYS) { - fc->no_listxattr = 1; + fm->fc->no_listxattr = 1; ret = -EOPNOTSUPP; } return ret; @@ -150,11 +150,11 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) int fuse_removexattr(struct inode *inode, const char *name) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); int err; - if (fc->no_removexattr) + if (fm->fc->no_removexattr) return -EOPNOTSUPP; args.opcode = FUSE_REMOVEXATTR; @@ -162,9 +162,9 @@ int fuse_removexattr(struct inode *inode, const char *name) args.in_numargs = 1; args.in_args[0].size = strlen(name) + 1; args.in_args[0].value = name; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_removexattr = 1; + fm->fc->no_removexattr = 1; err = -EOPNOTSUPP; } if (!err) { diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index d4af283fc888..9cd2ecad07db 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -91,22 +91,13 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc) struct inode *inode = page->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; - unsigned offset; + struct iomap_writepage_ctx wpc = { }; if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) goto out; if (current->journal_info) goto redirty; - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_SIZE-1); - if (page->index > end_index || (page->index == end_index && !offset)) { - page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); - goto out; - } - - return nobh_writepage(page, gfs2_get_block_noalloc, wbc); + return iomap_writepage(page, wbc, &wpc, &gfs2_writeback_ops); redirty: redirty_page_for_writepage(wbc, page); @@ -115,11 +106,16 @@ out: return 0; } -/* This is the same as calling block_write_full_page, but it also +/** + * gfs2_write_jdata_page - gfs2 jdata-specific version of block_write_full_page + * @page: The page to write + * @wbc: The writeback control + * + * This is the same as calling block_write_full_page, but it also * writes pages outside of i_size */ -static int gfs2_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) +static int gfs2_write_jdata_page(struct page *page, + struct writeback_control *wbc) { struct inode * const inode = page->mapping->host; loff_t i_size = i_size_read(inode); @@ -137,7 +133,7 @@ static int gfs2_write_full_page(struct page *page, get_block_t *get_block, if (page->index == end_index && offset) zero_user_segment(page, offset, PAGE_SIZE); - return __block_write_full_page(inode, page, get_block, wbc, + return __block_write_full_page(inode, page, gfs2_get_block_noalloc, wbc, end_buffer_async_write); } @@ -166,7 +162,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w } gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize); } - return gfs2_write_full_page(page, gfs2_get_block_noalloc, wbc); + return gfs2_write_jdata_page(page, wbc); } /** @@ -208,7 +204,8 @@ static int gfs2_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); - int ret = mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); + struct iomap_writepage_ctx wpc = { }; + int ret; /* * Even if we didn't write any pages here, we might still be holding @@ -216,9 +213,9 @@ static int gfs2_writepages(struct address_space *mapping, * want balance_dirty_pages() to loop indefinitely trying to write out * pages held in the ail that it can't find. */ + ret = iomap_writepages(mapping, wbc, &wpc, &gfs2_writeback_ops); if (ret == 0) set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags); - return ret; } @@ -470,12 +467,13 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) static int __gfs2_readpage(void *file, struct page *page) { - struct gfs2_inode *ip = GFS2_I(page->mapping->host); - struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); int error; - if (i_blocksize(page->mapping->host) == PAGE_SIZE && - !page_has_buffers(page)) { + if (!gfs2_is_jdata(ip) || + (i_blocksize(inode) == PAGE_SIZE && !page_has_buffers(page))) { error = iomap_readpage(page, &gfs2_iomap_ops); } else if (gfs2_is_stuffed(ip)) { error = stuffed_readpage(ip, page); @@ -563,8 +561,12 @@ static void gfs2_readahead(struct readahead_control *rac) struct inode *inode = rac->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - if (!gfs2_is_stuffed(ip)) + if (gfs2_is_stuffed(ip)) + ; + else if (gfs2_is_jdata(ip)) mpage_readahead(rac, gfs2_block_map); + else + iomap_readahead(rac, &gfs2_iomap_ops); } /** @@ -621,7 +623,8 @@ out: static int jdata_set_page_dirty(struct page *page) { - SetPageChecked(page); + if (current->journal_info) + SetPageChecked(page); return __set_page_dirty_buffers(page); } @@ -663,8 +666,11 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) if (bd) { if (!list_empty(&bd->bd_list) && !buffer_pinned(bh)) list_del_init(&bd->bd_list); - else + else { + spin_lock(&sdp->sd_ail_lock); gfs2_remove_from_journal(bh, REMOVE_JDATA); + spin_unlock(&sdp->sd_ail_lock); + } } bh->b_bdev = NULL; clear_buffer_mapped(bh); @@ -736,7 +742,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) */ gfs2_log_lock(sdp); - spin_lock(&sdp->sd_ail_lock); head = bh = page_buffers(page); do { if (atomic_read(&bh->b_count)) @@ -748,7 +753,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) goto cannot_release; bh = bh->b_this_page; } while(bh != head); - spin_unlock(&sdp->sd_ail_lock); head = bh = page_buffers(page); do { @@ -774,7 +778,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) return try_to_free_buffers(page); cannot_release: - spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); return 0; } @@ -784,12 +787,13 @@ static const struct address_space_operations gfs2_aops = { .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readahead = gfs2_readahead, + .set_page_dirty = iomap_set_page_dirty, + .releasepage = iomap_releasepage, + .invalidatepage = iomap_invalidatepage, .bmap = gfs2_bmap, - .invalidatepage = gfs2_invalidatepage, - .releasepage = gfs2_releasepage, .direct_IO = noop_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, + .migratepage = iomap_migrate_page, + .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 0f69fbd4af66..8dff9cbd0a87 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -56,7 +56,6 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, u64 block, struct page *page) { struct inode *inode = &ip->i_inode; - struct buffer_head *bh; int release = 0; if (!page || page->index) { @@ -80,20 +79,21 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, SetPageUptodate(page); } - if (!page_has_buffers(page)) - create_empty_buffers(page, BIT(inode->i_blkbits), - BIT(BH_Uptodate)); + if (gfs2_is_jdata(ip)) { + struct buffer_head *bh; - bh = page_buffers(page); + if (!page_has_buffers(page)) + create_empty_buffers(page, BIT(inode->i_blkbits), + BIT(BH_Uptodate)); - if (!buffer_mapped(bh)) - map_bh(bh, inode->i_sb, block); + bh = page_buffers(page); + if (!buffer_mapped(bh)) + map_bh(bh, inode->i_sb, block); - set_buffer_uptodate(bh); - if (gfs2_is_jdata(ip)) + set_buffer_uptodate(bh); gfs2_trans_add_data(ip->i_gl, bh); - else { - mark_buffer_dirty(bh); + } else { + set_page_dirty(page); gfs2_ordered_add_inode(ip); } @@ -1158,7 +1158,8 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, struct metapath mp = { .mp_aheight = 1, }; int ret; - iomap->flags |= IOMAP_F_BUFFER_HEAD; + if (gfs2_is_jdata(ip)) + iomap->flags |= IOMAP_F_BUFFER_HEAD; trace_gfs2_iomap_start(ip, pos, length, flags); if (gfs2_iomap_need_write_lock(flags)) { @@ -1291,6 +1292,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, loff_t length = bh_map->b_size; struct metapath mp = { .mp_aheight = 1, }; struct iomap iomap = { }; + int flags = create ? IOMAP_WRITE : 0; int ret; clear_buffer_mapped(bh_map); @@ -1298,15 +1300,14 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, clear_buffer_boundary(bh_map); trace_gfs2_bmap(ip, bh_map, lblock, create, 1); - if (create) { - ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, &iomap, &mp); - if (!ret && iomap.type == IOMAP_HOLE) + ret = gfs2_iomap_get(inode, pos, length, flags, &iomap, &mp); + if (!ret && iomap.type == IOMAP_HOLE) { + if (create) ret = gfs2_iomap_alloc(inode, &iomap, &mp); - release_metapath(&mp); - } else { - ret = gfs2_iomap_get(inode, pos, length, 0, &iomap, &mp); - release_metapath(&mp); + else + ret = -ENODATA; } + release_metapath(&mp); if (ret) goto out; @@ -2518,3 +2519,26 @@ out: gfs2_trans_end(sdp); return error; } + +static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode, + loff_t offset) +{ + struct metapath mp = { .mp_aheight = 1, }; + int ret; + + if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode)))) + return -EIO; + + if (offset >= wpc->iomap.offset && + offset < wpc->iomap.offset + wpc->iomap.length) + return 0; + + memset(&wpc->iomap, 0, sizeof(wpc->iomap)); + ret = gfs2_iomap_get(inode, offset, INT_MAX, 0, &wpc->iomap, &mp); + release_metapath(&mp); + return ret; +} + +const struct iomap_writeback_ops gfs2_writeback_ops = { + .map_blocks = gfs2_map_blocks, +}; diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index b88fd45ab79f..aed4632d47d3 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -44,6 +44,7 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, } extern const struct iomap_ops gfs2_iomap_ops; +extern const struct iomap_writeback_ops gfs2_writeback_ops; extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); extern int gfs2_block_map(struct inode *inode, sector_t lblock, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index f13b136654ca..5441c17562c5 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -270,7 +270,12 @@ static void __gfs2_glock_put(struct gfs2_glock *gl) gfs2_glock_remove_from_lru(gl); spin_unlock(&gl->gl_lockref.lock); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); - GLOCK_BUG_ON(gl, mapping && mapping->nrpages && !gfs2_withdrawn(sdp)); + if (mapping) { + truncate_inode_pages_final(mapping); + if (!gfs2_withdrawn(sdp)) + GLOCK_BUG_ON(gl, mapping->nrpages || + mapping->nrexceptional); + } trace_gfs2_glock_put(gl); sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } @@ -453,9 +458,6 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) else gl->gl_lockref.count--; } - if (held1 && held2 && list_empty(&gl->gl_holders)) - clear_bit(GLF_QUEUED, &gl->gl_flags); - if (new_state != gl->gl_target) /* shorten our minimum hold time */ gl->gl_hold_time = max(gl->gl_hold_time - GL_GLOCK_HOLD_DECR, @@ -1049,7 +1051,8 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_object = NULL; gl->gl_hold_time = GL_GLOCK_DFT_HOLD; INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); - INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func); + if (gl->gl_name.ln_type == LM_TYPE_IOPEN) + INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func); mapping = gfs2_glock2aspace(gl); if (mapping) { @@ -1345,7 +1348,6 @@ fail: if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) insert_pt = &gh2->gh_list; } - set_bit(GLF_QUEUED, &gl->gl_flags); trace_gfs2_glock_queue(gh, 1); gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); @@ -1646,16 +1648,15 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) unsigned long now = jiffies; gfs2_glock_hold(gl); + spin_lock(&gl->gl_lockref.lock); holdtime = gl->gl_tchange + gl->gl_hold_time; - if (test_bit(GLF_QUEUED, &gl->gl_flags) && + if (!list_empty(&gl->gl_holders) && gl->gl_name.ln_type == LM_TYPE_INODE) { if (time_before(now, holdtime)) delay = holdtime - now; if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) delay = gl->gl_hold_time; } - - spin_lock(&gl->gl_lockref.lock); handle_callback(gl, state, delay, true); __gfs2_glock_queue_work(gl, delay); spin_unlock(&gl->gl_lockref.lock); @@ -1842,10 +1843,9 @@ static struct shrinker glock_shrinker = { }; /** - * examine_bucket - Call a function for glock in a hash bucket + * glock_hash_walk - Call a function for glock in a hash bucket * @examiner: the function * @sdp: the filesystem - * @bucket: the bucket * * Note that the function can be called multiple times on the same * object. So the user must ensure that the function can cope with @@ -1901,9 +1901,11 @@ bool gfs2_delete_work_queued(const struct gfs2_glock *gl) static void flush_delete_work(struct gfs2_glock *gl) { - if (cancel_delayed_work(&gl->gl_delete)) { - queue_delayed_work(gfs2_delete_workqueue, - &gl->gl_delete, 0); + if (gl->gl_name.ln_type == LM_TYPE_IOPEN) { + if (cancel_delayed_work(&gl->gl_delete)) { + queue_delayed_work(gfs2_delete_workqueue, + &gl->gl_delete, 0); + } } gfs2_glock_queue_work(gl, 0); } @@ -2100,7 +2102,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'I'; if (test_bit(GLF_FROZEN, gflags)) *p++ = 'F'; - if (test_bit(GLF_QUEUED, gflags)) + if (!list_empty(&gl->gl_holders)) *p++ = 'q'; if (test_bit(GLF_LRU, gflags)) *p++ = 'L'; @@ -2415,7 +2417,7 @@ static const struct seq_operations gfs2_glstats_seq_ops = { .show = gfs2_glstats_seq_show, }; -static const struct seq_operations gfs2_sbstats_seq_ops = { +static const struct seq_operations gfs2_sbstats_sops = { .start = gfs2_sbstats_seq_start, .next = gfs2_sbstats_seq_next, .stop = gfs2_sbstats_seq_stop, @@ -2468,16 +2470,6 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file) return __gfs2_glocks_open(inode, file, &gfs2_glstats_seq_ops); } -static int gfs2_sbstats_open(struct inode *inode, struct file *file) -{ - int ret = seq_open(file, &gfs2_sbstats_seq_ops); - if (ret == 0) { - struct seq_file *seq = file->private_data; - seq->private = inode->i_private; /* sdp */ - } - return ret; -} - static const struct file_operations gfs2_glocks_fops = { .owner = THIS_MODULE, .open = gfs2_glocks_open, @@ -2494,13 +2486,7 @@ static const struct file_operations gfs2_glstats_fops = { .release = gfs2_glocks_release, }; -static const struct file_operations gfs2_sbstats_fops = { - .owner = THIS_MODULE, - .open = gfs2_sbstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(gfs2_sbstats); void gfs2_create_debugfs_file(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index de1d5f1d9ff8..aa3f5236befb 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -178,6 +178,9 @@ static int rgrp_go_sync(struct gfs2_glock *gl) struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = &sdp->sd_aspace; struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); + const unsigned bsize = sdp->sd_sb.sb_bsize; + loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK; + loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; int error; if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) @@ -186,18 +189,13 @@ static int rgrp_go_sync(struct gfs2_glock *gl) gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_RGRP_GO_SYNC); - filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end); - error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end); - WARN_ON_ONCE(error); + filemap_fdatawrite_range(mapping, start, end); + error = filemap_fdatawait_range(mapping, start, end); + WARN_ON_ONCE(error && !gfs2_withdrawn(sdp)); mapping_set_error(mapping, error); if (!error) error = gfs2_ail_empty_gl(gl); - - spin_lock(&gl->gl_lockref.lock); - rgd = gl->gl_object; - if (rgd) - gfs2_free_clones(rgd); - spin_unlock(&gl->gl_lockref.lock); + gfs2_free_clones(rgd); return error; } @@ -216,15 +214,23 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = &sdp->sd_aspace; struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); + const unsigned bsize = sdp->sd_sb.sb_bsize; + loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK; + loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; - if (rgd) - gfs2_rgrp_brelse(rgd); - + gfs2_rgrp_brelse(rgd); WARN_ON_ONCE(!(flags & DIO_METADATA)); - truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + truncate_inode_pages_range(mapping, start, end); + rgd->rd_flags &= ~GFS2_RDF_UPTODATE; +} + +static void gfs2_rgrp_go_dump(struct seq_file *seq, struct gfs2_glock *gl, + const char *fs_id_buf) +{ + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); if (rgd) - rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + gfs2_rgrp_dump(seq, rgd, fs_id_buf); } static struct gfs2_inode *gfs2_glock2inode(struct gfs2_glock *gl) @@ -712,7 +718,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_sync = rgrp_go_sync, .go_inval = rgrp_go_inval, .go_lock = gfs2_rgrp_go_lock, - .go_dump = gfs2_rgrp_dump, + .go_dump = gfs2_rgrp_go_dump, .go_type = LM_TYPE_RGRP, .go_flags = GLOF_LVB, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index ca2ec02436ec..d7707307f4b1 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -41,6 +41,10 @@ struct gfs2_log_header_host { u32 lh_flags; /* GFS2_LOG_HEAD_... */ u32 lh_tail; /* Block number of log tail */ u32 lh_blkno; + + s64 lh_local_total; + s64 lh_local_free; + s64 lh_local_dinodes; }; /* @@ -340,7 +344,6 @@ enum { GLF_REPLY_PENDING = 9, GLF_INITIAL = 10, GLF_FROZEN = 11, - GLF_QUEUED = 12, GLF_LRU = 13, GLF_OBJECT = 14, /* Used only for tracing */ GLF_BLOCKING = 15, @@ -378,17 +381,10 @@ struct gfs2_glock { atomic_t gl_ail_count; atomic_t gl_revokes; struct delayed_work gl_work; - union { - /* For iopen glocks only */ - struct { - struct delayed_work gl_delete; - u64 gl_no_formal_ino; - }; - /* For rgrp glocks only */ - struct { - loff_t start; - loff_t end; - } gl_vm; + /* For iopen glocks only */ + struct { + struct delayed_work gl_delete; + u64 gl_no_formal_ino; }; struct rcu_head gl_rcu; struct rhash_head gl_node; @@ -701,10 +697,18 @@ struct gfs2_pcpu_lkstats { struct gfs2_lkstats lkstats[10]; }; +/* List of local (per node) statfs inodes */ +struct local_statfs_inode { + struct list_head si_list; + struct inode *si_sc_inode; + unsigned int si_jid; /* journal id this statfs inode corresponds to */ +}; + struct gfs2_sbd { struct super_block *sd_vfs; struct gfs2_pcpu_lkstats __percpu *sd_lkstats; struct kobject sd_kobj; + struct completion sd_kobj_unregister; unsigned long sd_flags; /* SDF_... */ struct gfs2_sb_host sd_sb; @@ -751,6 +755,7 @@ struct gfs2_sbd { struct inode *sd_jindex; struct inode *sd_statfs_inode; struct inode *sd_sc_inode; + struct list_head sd_sc_inodes_list; struct inode *sd_qc_inode; struct inode *sd_rindex; struct inode *sd_quota_inode; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 3763c9ff1406..9133b3178677 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -70,7 +70,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct) * */ -static void gfs2_remove_from_ail(struct gfs2_bufdata *bd) +void gfs2_remove_from_ail(struct gfs2_bufdata *bd) { bd->bd_tr = NULL; list_del_init(&bd->bd_ail_st_list); @@ -244,13 +244,15 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp) * @tr: the transaction * @max_revokes: If nonzero, issue revokes for the bd items for written buffers * + * returns: the transaction's count of remaining active items */ -static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, +static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, int *max_revokes) { struct gfs2_bufdata *bd, *s; struct buffer_head *bh; + int active_count = 0; list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) { @@ -265,8 +267,10 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, * If the ail buffer is not busy and caught an error, flag it * for others. */ - if (!sdp->sd_log_error && buffer_busy(bh)) + if (!sdp->sd_log_error && buffer_busy(bh)) { + active_count++; continue; + } if (!buffer_uptodate(bh) && !cmpxchg(&sdp->sd_log_error, 0, -EIO)) { gfs2_io_error_bh(sdp, bh); @@ -285,6 +289,7 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, } list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); } + return active_count; } /** @@ -303,8 +308,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes) spin_lock(&sdp->sd_ail_lock); list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { - gfs2_ail1_empty_one(sdp, tr, &max_revokes); - if (list_empty(&tr->tr_ail1_list) && oldest_tr) + if (!gfs2_ail1_empty_one(sdp, tr, &max_revokes) && oldest_tr) list_move(&tr->tr_list, &sdp->sd_ail2_list); else oldest_tr = 0; @@ -716,16 +720,24 @@ void gfs2_write_revokes(struct gfs2_sbd *sdp) atomic_dec(&sdp->sd_log_blks_free); /* If no blocks have been reserved, we need to also * reserve a block for the header */ - if (!sdp->sd_log_blks_reserved) + if (!sdp->sd_log_blks_reserved) { atomic_dec(&sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, -2); + } else { + trace_gfs2_log_blocks(sdp, -1); + } } gfs2_ail1_empty(sdp, max_revokes); gfs2_log_unlock(sdp); if (!sdp->sd_log_num_revoke) { atomic_inc(&sdp->sd_log_blks_free); - if (!sdp->sd_log_blks_reserved) + if (!sdp->sd_log_blks_reserved) { atomic_inc(&sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, 2); + } else { + trace_gfs2_log_blocks(sdp, 1); + } } } @@ -902,7 +914,7 @@ static void empty_ail1_list(struct gfs2_sbd *sdp) } /** - * drain_bd - drain the buf and databuf queue for a failed transaction + * trans_drain - drain the buf and databuf queue for a failed transaction * @tr: the transaction to drain * * When this is called, we're taking an error exit for a log write that failed @@ -954,10 +966,8 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) goto out; /* Log might have been flushed while we waited for the flush lock */ - if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) { - up_write(&sdp->sd_log_flush_lock); - return; - } + if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) + goto out; trace_gfs2_log_flush(sdp, 1, flags); if (flags & GFS2_LOG_HEAD_FLUSH_SHUTDOWN) @@ -971,25 +981,25 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) if (unlikely (state == SFS_FROZEN)) if (gfs2_assert_withdraw_delayed(sdp, !tr->tr_num_buf_new && !tr->tr_num_databuf_new)) - goto out; + goto out_withdraw; } if (unlikely(state == SFS_FROZEN)) if (gfs2_assert_withdraw_delayed(sdp, !sdp->sd_log_num_revoke)) - goto out; + goto out_withdraw; if (gfs2_assert_withdraw_delayed(sdp, sdp->sd_log_num_revoke == sdp->sd_log_committed_revoke)) - goto out; + goto out_withdraw; gfs2_ordered_write(sdp); if (gfs2_withdrawn(sdp)) - goto out; + goto out_withdraw; lops_before_commit(sdp, tr); if (gfs2_withdrawn(sdp)) - goto out; + goto out_withdraw; gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE); if (gfs2_withdrawn(sdp)) - goto out; + goto out_withdraw; if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); @@ -1000,7 +1010,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) log_write_header(sdp, flags); } if (gfs2_withdrawn(sdp)) - goto out; + goto out_withdraw; lops_after_commit(sdp, tr); gfs2_log_lock(sdp); @@ -1020,7 +1030,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) if (!sdp->sd_log_idle) { empty_ail1_list(sdp); if (gfs2_withdrawn(sdp)) - goto out; + goto out_withdraw; atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ trace_gfs2_log_blocks(sdp, -1); log_write_header(sdp, flags); @@ -1033,27 +1043,30 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) atomic_set(&sdp->sd_freeze_state, SFS_FROZEN); } -out: - if (gfs2_withdrawn(sdp)) { - trans_drain(tr); - /** - * If the tr_list is empty, we're withdrawing during a log - * flush that targets a transaction, but the transaction was - * never queued onto any of the ail lists. Here we add it to - * ail1 just so that ail_drain() will find and free it. - */ - spin_lock(&sdp->sd_ail_lock); - if (tr && list_empty(&tr->tr_list)) - list_add(&tr->tr_list, &sdp->sd_ail1_list); - spin_unlock(&sdp->sd_ail_lock); - ail_drain(sdp); /* frees all transactions */ - tr = NULL; - } - +out_end: trace_gfs2_log_flush(sdp, 0, flags); +out: up_write(&sdp->sd_log_flush_lock); - gfs2_trans_free(sdp, tr); + if (gfs2_withdrawing(sdp)) + gfs2_withdraw(sdp); + return; + +out_withdraw: + trans_drain(tr); + /** + * If the tr_list is empty, we're withdrawing during a log + * flush that targets a transaction, but the transaction was + * never queued onto any of the ail lists. Here we add it to + * ail1 just so that ail_drain() will find and free it. + */ + spin_lock(&sdp->sd_ail_lock); + if (tr && list_empty(&tr->tr_list)) + list_add(&tr->tr_list, &sdp->sd_ail1_list); + spin_unlock(&sdp->sd_ail_lock); + ail_drain(sdp); /* frees all transactions */ + tr = NULL; + goto out_end; } /** diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 8965c751a303..79f97290146e 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -63,7 +63,7 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct); - +extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index ed1da4323967..ed69298dd824 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -823,7 +823,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, * */ -static void gfs2_meta_sync(struct gfs2_glock *gl) +void gfs2_meta_sync(struct gfs2_glock *gl) { struct address_space *mapping = gfs2_glock2aspace(gl); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 9c5e4e491e03..4a3d8aecdf82 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -27,6 +27,7 @@ extern void gfs2_log_submit_bio(struct bio **biop, int opf); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, bool keep_cache); +extern void gfs2_meta_sync(struct gfs2_glock *gl); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 9856cc2e0795..2db573e31f78 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -348,38 +348,109 @@ void gfs2_remove_from_journal(struct buffer_head *bh, int meta) brelse(bh); } if (bd) { - spin_lock(&sdp->sd_ail_lock); if (bd->bd_tr) { gfs2_trans_add_revoke(sdp, bd); } else if (was_pinned) { bh->b_private = NULL; kmem_cache_free(gfs2_bufdata_cachep, bd); + } else if (!list_empty(&bd->bd_ail_st_list) && + !list_empty(&bd->bd_ail_gl_list)) { + gfs2_remove_from_ail(bd); } - spin_unlock(&sdp->sd_ail_lock); } clear_buffer_dirty(bh); clear_buffer_uptodate(bh); } /** - * gfs2_meta_wipe - make inode's buffers so they aren't dirty/pinned anymore + * gfs2_ail1_wipe - remove deleted/freed buffers from the ail1 list + * @sdp: superblock + * @bstart: starting block address of buffers to remove + * @blen: length of buffers to be removed + * + * This function is called from gfs2_journal wipe, whose job is to remove + * buffers, corresponding to deleted blocks, from the journal. If we find any + * bufdata elements on the system ail1 list, they haven't been written to + * the journal yet. So we remove them. + */ +static void gfs2_ail1_wipe(struct gfs2_sbd *sdp, u64 bstart, u32 blen) +{ + struct gfs2_trans *tr, *s; + struct gfs2_bufdata *bd, *bs; + struct buffer_head *bh; + u64 end = bstart + blen; + + gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_safe(tr, s, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry_safe(bd, bs, &tr->tr_ail1_list, + bd_ail_st_list) { + bh = bd->bd_bh; + if (bh->b_blocknr < bstart || bh->b_blocknr >= end) + continue; + + gfs2_remove_from_journal(bh, REMOVE_JDATA); + } + } + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); +} + +static struct buffer_head *gfs2_getjdatabuf(struct gfs2_inode *ip, u64 blkno) +{ + struct address_space *mapping = ip->i_inode.i_mapping; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct page *page; + struct buffer_head *bh; + unsigned int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift; + unsigned long index = blkno >> shift; /* convert block to page */ + unsigned int bufnum = blkno - (index << shift); + + page = find_get_page_flags(mapping, index, FGP_LOCK|FGP_ACCESSED); + if (!page) + return NULL; + if (!page_has_buffers(page)) { + unlock_page(page); + put_page(page); + return NULL; + } + /* Locate header for our buffer within our page */ + for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page) + /* Do nothing */; + get_bh(bh); + unlock_page(page); + put_page(page); + return bh; +} + +/** + * gfs2_journal_wipe - make inode's buffers so they aren't dirty/pinned anymore * @ip: the inode who owns the buffers * @bstart: the first buffer in the run * @blen: the number of buffers in the run * */ -void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) +void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *bh; + int ty; + gfs2_ail1_wipe(sdp, bstart, blen); while (blen) { + ty = REMOVE_META; bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE); + if (!bh && gfs2_is_jdata(ip)) { + bh = gfs2_getjdatabuf(ip, bstart); + ty = REMOVE_JDATA; + } if (bh) { lock_buffer(bh); gfs2_log_lock(sdp); - gfs2_remove_from_journal(bh, REMOVE_META); + spin_lock(&sdp->sd_ail_lock); + gfs2_remove_from_journal(bh, ty); + spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); unlock_buffer(bh); brelse(bh); diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index eafb74e861c6..4a8c01929b79 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -60,7 +60,7 @@ enum { }; extern void gfs2_remove_from_journal(struct buffer_head *bh, int meta); -extern void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); extern int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, struct buffer_head **bhp); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 6d18d2c91add..7a7e3c10a9a9 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -110,6 +110,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) spin_lock_init(&sdp->sd_trunc_lock); spin_lock_init(&sdp->sd_bitmap_lock); + INIT_LIST_HEAD(&sdp->sd_sc_inodes_list); + mapping = &sdp->sd_aspace; address_space_init_once(mapping); @@ -169,15 +171,19 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) return -EINVAL; } - /* If format numbers match exactly, we're done. */ - - if (sb->sb_fs_format == GFS2_FORMAT_FS && - sb->sb_multihost_format == GFS2_FORMAT_MULTI) - return 0; + if (sb->sb_fs_format != GFS2_FORMAT_FS || + sb->sb_multihost_format != GFS2_FORMAT_MULTI) { + fs_warn(sdp, "Unknown on-disk format, unable to mount\n"); + return -EINVAL; + } - fs_warn(sdp, "Unknown on-disk format, unable to mount\n"); + if (sb->sb_bsize < 512 || sb->sb_bsize > PAGE_SIZE || + (sb->sb_bsize & (sb->sb_bsize - 1))) { + pr_warn("Invalid superblock size\n"); + return -EINVAL; + } - return -EINVAL; + return 0; } static void end_bio_io_page(struct bio *bio) @@ -604,6 +610,90 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) return error; } +/** + * init_statfs - look up and initialize master and local (per node) statfs inodes + * @sdp: The GFS2 superblock + * + * This should be called after the jindex is initialized in init_journal() and + * before gfs2_journal_recovery() is called because we need to be able to write + * to these inodes during recovery. + * + * Returns: errno + */ +static int init_statfs(struct gfs2_sbd *sdp) +{ + int error = 0; + struct inode *master = d_inode(sdp->sd_master_dir); + struct inode *pn = NULL; + char buf[30]; + struct gfs2_jdesc *jd; + struct gfs2_inode *ip; + + sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); + if (IS_ERR(sdp->sd_statfs_inode)) { + error = PTR_ERR(sdp->sd_statfs_inode); + fs_err(sdp, "can't read in statfs inode: %d\n", error); + goto fail; + } + + pn = gfs2_lookup_simple(master, "per_node"); + if (IS_ERR(pn)) { + error = PTR_ERR(pn); + fs_err(sdp, "can't find per_node directory: %d\n", error); + goto put_statfs; + } + + /* For each jid, lookup the corresponding local statfs inode in the + * per_node metafs directory and save it in the sdp->sd_sc_inodes_list. */ + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + struct local_statfs_inode *lsi = + kmalloc(sizeof(struct local_statfs_inode), GFP_NOFS); + if (!lsi) { + error = -ENOMEM; + goto free_local; + } + sprintf(buf, "statfs_change%u", jd->jd_jid); + lsi->si_sc_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(lsi->si_sc_inode)) { + error = PTR_ERR(lsi->si_sc_inode); + fs_err(sdp, "can't find local \"sc\" file#%u: %d\n", + jd->jd_jid, error); + goto free_local; + } + lsi->si_jid = jd->jd_jid; + if (jd->jd_jid == sdp->sd_jdesc->jd_jid) + sdp->sd_sc_inode = lsi->si_sc_inode; + + list_add_tail(&lsi->si_list, &sdp->sd_sc_inodes_list); + } + + iput(pn); + ip = GFS2_I(sdp->sd_sc_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + &sdp->sd_sc_gh); + if (error) { + fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); + goto free_local; + } + return 0; + +free_local: + free_local_statfs_inodes(sdp); + iput(pn); +put_statfs: + iput(sdp->sd_statfs_inode); +fail: + return error; +} + +/* Uninitialize and free up memory used by the list of statfs inodes */ +static void uninit_statfs(struct gfs2_sbd *sdp) +{ + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); + free_local_statfs_inodes(sdp); + iput(sdp->sd_statfs_inode); +} + static int init_journal(struct gfs2_sbd *sdp, int undo) { struct inode *master = d_inode(sdp->sd_master_dir); @@ -690,6 +780,11 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) } trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free)); + /* Lookup statfs inodes here so journal recovery can use them. */ + error = init_statfs(sdp); + if (error) + goto fail_jinode_gh; + if (sdp->sd_lockstruct.ls_first) { unsigned int x; for (x = 0; x < sdp->sd_journals; x++) { @@ -698,14 +793,14 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) if (sdp->sd_args.ar_spectator) { error = check_journal_clean(sdp, jd, true); if (error) - goto fail_jinode_gh; + goto fail_statfs; continue; } error = gfs2_recover_journal(jd, true); if (error) { fs_err(sdp, "error recovering journal %u: %d\n", x, error); - goto fail_jinode_gh; + goto fail_statfs; } } @@ -714,7 +809,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) error = gfs2_recover_journal(sdp->sd_jdesc, true); if (error) { fs_err(sdp, "error recovering my journal: %d\n", error); - goto fail_jinode_gh; + goto fail_statfs; } } @@ -725,6 +820,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) INIT_WORK(&sdp->sd_freeze_work, gfs2_freeze_func); return 0; +fail_statfs: + uninit_statfs(sdp); fail_jinode_gh: /* A withdraw may have done dq/uninit so now we need to check it */ if (!sdp->sd_args.ar_spectator && @@ -758,20 +855,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) if (error) goto fail; - /* Read in the master statfs inode */ - sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); - if (IS_ERR(sdp->sd_statfs_inode)) { - error = PTR_ERR(sdp->sd_statfs_inode); - fs_err(sdp, "can't read in statfs inode: %d\n", error); - goto fail_journal; - } - /* Read in the resource index inode */ sdp->sd_rindex = gfs2_lookup_simple(master, "rindex"); if (IS_ERR(sdp->sd_rindex)) { error = PTR_ERR(sdp->sd_rindex); fs_err(sdp, "can't get resource index inode: %d\n", error); - goto fail_statfs; + goto fail_journal; } sdp->sd_rindex_uptodate = 0; @@ -800,8 +889,6 @@ fail_qinode: fail_rindex: gfs2_clear_rgrpd(sdp); iput(sdp->sd_rindex); -fail_statfs: - iput(sdp->sd_statfs_inode); fail_journal: init_journal(sdp, UNDO); fail: @@ -829,14 +916,6 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) return error; } - sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); - sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); - if (IS_ERR(sdp->sd_sc_inode)) { - error = PTR_ERR(sdp->sd_sc_inode); - fs_err(sdp, "can't find local \"sc\" file: %d\n", error); - goto fail; - } - sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf); if (IS_ERR(sdp->sd_qc_inode)) { @@ -848,33 +927,21 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) iput(pn); pn = NULL; - ip = GFS2_I(sdp->sd_sc_inode); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, - &sdp->sd_sc_gh); - if (error) { - fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); - goto fail_qc_i; - } - ip = GFS2_I(sdp->sd_qc_inode); error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &sdp->sd_qc_gh); if (error) { fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); - goto fail_ut_gh; + goto fail_qc_i; } return 0; fail_qc_gh: gfs2_glock_dq_uninit(&sdp->sd_qc_gh); -fail_ut_gh: - gfs2_glock_dq_uninit(&sdp->sd_sc_gh); fail_qc_i: iput(sdp->sd_qc_inode); fail_ut_i: - iput(sdp->sd_sc_inode); -fail: iput(pn); return error; } @@ -1062,26 +1129,14 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) } error = init_names(sdp, silent); - if (error) { - /* In this case, we haven't initialized sysfs, so we have to - manually free the sdp. */ - free_sbd(sdp); - sb->s_fs_info = NULL; - return error; - } + if (error) + goto fail_free; snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name); error = gfs2_sys_fs_add(sdp); - /* - * If we hit an error here, gfs2_sys_fs_add will have called function - * kobject_put which causes the sysfs usage count to go to zero, which - * causes sysfs to call function gfs2_sbd_release, which frees sdp. - * Subsequent error paths here will call gfs2_sys_fs_del, which also - * kobject_put to free sdp. - */ if (error) - return error; + goto fail_free; gfs2_create_debugfs_file(sdp); @@ -1179,9 +1234,9 @@ fail_lm: gfs2_lm_unmount(sdp); fail_debug: gfs2_delete_debugfs_file(sdp); - /* gfs2_sys_fs_del must be the last thing we do, since it causes - * sysfs to call function gfs2_sbd_release, which frees sdp. */ gfs2_sys_fs_del(sdp); +fail_free: + free_sbd(sdp); sb->s_fs_info = NULL; return error; } diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 390ea79d682c..b5cbe21efdfb 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -144,6 +144,10 @@ int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, head->lh_tail = be32_to_cpu(lh->lh_tail); head->lh_blkno = be32_to_cpu(lh->lh_blkno); + head->lh_local_total = be64_to_cpu(lh->lh_local_total); + head->lh_local_free = be64_to_cpu(lh->lh_local_free); + head->lh_local_dinodes = be64_to_cpu(lh->lh_local_dinodes); + return 0; } /** @@ -292,6 +296,109 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message); } +/** + * update_statfs_inode - Update the master statfs inode or zero out the local + * statfs inode for a given journal. + * @jd: The journal + * @head: If NULL, @inode is the local statfs inode and we need to zero it out. + * Otherwise, it @head contains the statfs change info that needs to be + * synced to the master statfs inode (pointed to by @inode). + * @inode: statfs inode to update. + */ +static int update_statfs_inode(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + struct inode *inode) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_inode *ip; + struct buffer_head *bh; + struct gfs2_statfs_change_host sc; + int error = 0; + + BUG_ON(!inode); + ip = GFS2_I(inode); + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out; + + spin_lock(&sdp->sd_statfs_spin); + + if (head) { /* Update the master statfs inode */ + gfs2_statfs_change_in(&sc, bh->b_data + sizeof(struct gfs2_dinode)); + sc.sc_total += head->lh_local_total; + sc.sc_free += head->lh_local_free; + sc.sc_dinodes += head->lh_local_dinodes; + gfs2_statfs_change_out(&sc, bh->b_data + sizeof(struct gfs2_dinode)); + + fs_info(sdp, "jid=%u: Updated master statfs Total:%lld, " + "Free:%lld, Dinodes:%lld after change " + "[%+lld,%+lld,%+lld]\n", jd->jd_jid, sc.sc_total, + sc.sc_free, sc.sc_dinodes, head->lh_local_total, + head->lh_local_free, head->lh_local_dinodes); + } else { /* Zero out the local statfs inode */ + memset(bh->b_data + sizeof(struct gfs2_dinode), 0, + sizeof(struct gfs2_statfs_change)); + /* If it's our own journal, reset any in-memory changes too */ + if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) { + memset(&sdp->sd_statfs_local, 0, + sizeof(struct gfs2_statfs_change_host)); + } + } + spin_unlock(&sdp->sd_statfs_spin); + + mark_buffer_dirty(bh); + brelse(bh); + gfs2_meta_sync(ip->i_gl); + +out: + return error; +} + +/** + * recover_local_statfs - Update the master and local statfs changes for this + * journal. + * + * Previously, statfs updates would be read in from the local statfs inode and + * synced to the master statfs inode during recovery. + * + * We now use the statfs updates in the journal head to update the master statfs + * inode instead of reading in from the local statfs inode. To preserve backward + * compatibility with kernels that can't do this, we still need to keep the + * local statfs inode up to date by writing changes to it. At some point in the + * future, we can do away with the local statfs inodes altogether and keep the + * statfs changes solely in the journal. + * + * @jd: the journal + * @head: the journal head + * + * Returns: errno + */ +static void recover_local_statfs(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head) +{ + int error; + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (!head->lh_local_total && !head->lh_local_free + && !head->lh_local_dinodes) /* No change */ + goto zero_local; + + /* First update the master statfs inode with the changes we + * found in the journal. */ + error = update_statfs_inode(jd, head, sdp->sd_statfs_inode); + if (error) + goto out; + +zero_local: + /* Zero out the local statfs inode so any changes in there + * are not re-recovered. */ + error = update_statfs_inode(jd, NULL, + find_local_statfs_inode(sdp, jd->jd_jid)); +out: + return; +} + void gfs2_recover_func(struct work_struct *work) { struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); @@ -411,6 +518,7 @@ void gfs2_recover_func(struct work_struct *work) goto fail_gunlock_thaw; } + recover_local_statfs(jd, &head); clean_journal(jd, &head); up_read(&sdp->sd_log_flush_lock); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 074f228ea839..ee491bb9c1cc 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -878,7 +878,6 @@ static int rgd_insert(struct gfs2_rgrpd *rgd) static int read_rindex_entry(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - const unsigned bsize = sdp->sd_sb.sb_bsize; loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); struct gfs2_rindex buf; int error; @@ -924,9 +923,6 @@ static int read_rindex_entry(struct gfs2_inode *ip) spin_unlock(&sdp->sd_rindex_spin); if (!error) { glock_set_object(rgd->rd_gl, rgd); - rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK; - rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr + - rgd->rd_length) * bsize) - 1; return 0; } @@ -2209,20 +2205,17 @@ static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd, /** * gfs2_rgrp_dump - print out an rgrp * @seq: The iterator - * @gl: The glock in question + * @rgd: The rgrp in question * @fs_id_buf: pointer to file system id (if requested) * */ -void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl, +void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, const char *fs_id_buf) { - struct gfs2_rgrpd *rgd = gl->gl_object; struct gfs2_blkreserv *trs; const struct rb_node *n; - if (rgd == NULL) - return; gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n", fs_id_buf, (unsigned long long)rgd->rd_addr, rgd->rd_flags, @@ -2253,7 +2246,7 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) (unsigned long long)rgd->rd_addr); fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname); - gfs2_rgrp_dump(NULL, rgd->rd_gl, fs_id_buf); + gfs2_rgrp_dump(NULL, rgd, fs_id_buf); rgd->rd_flags |= GFS2_RDF_ERROR; } @@ -2445,8 +2438,8 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); /* Directories keep their data in the metadata address space */ - if (meta || ip->i_depth) - gfs2_meta_wipe(ip, bstart, blen); + if (meta || ip->i_depth || gfs2_is_jdata(ip)) + gfs2_journal_wipe(ip, bstart, blen); } /** @@ -2502,7 +2495,7 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) gfs2_statfs_change(sdp, 0, +1, -1); trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE); gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid); - gfs2_meta_wipe(ip, ip->i_no_addr, 1); + gfs2_journal_wipe(ip, ip->i_no_addr, 1); } /** diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index a1d7e14fc55b..9a587ada51ed 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -67,7 +67,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); -extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl, +extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, const char *fs_id_buf); extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, struct buffer_head *bh, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 9f4d9e7be839..b285192bd6b3 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -44,6 +44,12 @@ #include "xattr.h" #include "lops.h" +enum dinode_demise { + SHOULD_DELETE_DINODE, + SHOULD_NOT_DELETE_DINODE, + SHOULD_DEFER_EVICTION, +}; + /** * gfs2_jindex_free - Clear all the journal index information * @sdp: The GFS2 superblock @@ -224,7 +230,7 @@ void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) sc->sc_dinodes = be64_to_cpu(str->sc_dinodes); } -static void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) +void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) { struct gfs2_statfs_change *str = buf; @@ -702,6 +708,8 @@ restart: if (error) gfs2_io_error(sdp); } + WARN_ON(gfs2_withdrawing(sdp)); + /* At this point, we're through modifying the disk */ /* Release stuff */ @@ -721,7 +729,7 @@ restart: gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); gfs2_glock_dq_uninit(&sdp->sd_sc_gh); gfs2_glock_dq_uninit(&sdp->sd_qc_gh); - iput(sdp->sd_sc_inode); + free_local_statfs_inodes(sdp); iput(sdp->sd_qc_inode); } @@ -736,6 +744,7 @@ restart: /* At this point, we're through participating in the lockspace */ gfs2_sys_fs_del(sdp); + free_sbd(sdp); } /** @@ -1309,109 +1318,98 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode) } /** - * gfs2_evict_inode - Remove an inode from cache + * evict_should_delete - determine whether the inode is eligible for deletion * @inode: The inode to evict * - * There are three cases to consider: - * 1. i_nlink == 0, we are final opener (and must deallocate) - * 2. i_nlink == 0, we are not the final opener (and cannot deallocate) - * 3. i_nlink > 0 - * - * If the fs is read only, then we have to treat all cases as per #3 - * since we are unable to do any deallocation. The inode will be - * deallocated by the next read/write node to attempt an allocation - * in the same resource group + * This function determines whether the evicted inode is eligible to be deleted + * and locks the inode glock. * - * We have to (at the moment) hold the inodes main lock to cover - * the gap between unlocking the shared lock on the iopen lock and - * taking the exclusive lock. I'd rather do a shared -> exclusive - * conversion on the iopen lock, but we can change that later. This - * is safe, just less efficient. + * Returns: the fate of the dinode */ - -static void gfs2_evict_inode(struct inode *inode) +static enum dinode_demise evict_should_delete(struct inode *inode, + struct gfs2_holder *gh) { + struct gfs2_inode *ip = GFS2_I(inode); struct super_block *sb = inode->i_sb; struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - struct address_space *metamapping; - int error; - - if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) { - clear_inode(inode); - return; - } - - if (inode->i_nlink || sb_rdonly(sb)) - goto out; + int ret; if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl)); - gfs2_holder_mark_uninitialized(&gh); - goto out_delete; + goto should_delete; } if (test_bit(GIF_DEFERRED_DELETE, &ip->i_flags)) - goto out; + return SHOULD_DEFER_EVICTION; /* Deletes should never happen under memory pressure anymore. */ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) - goto out; + return SHOULD_DEFER_EVICTION; /* Must not read inode block until block type has been verified */ - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); - if (unlikely(error)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, gh); + if (unlikely(ret)) { glock_clear_object(ip->i_iopen_gh.gh_gl, ip); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); - goto out; + return SHOULD_DEFER_EVICTION; } if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino)) - goto out_truncate; - error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); - if (error) - goto out_truncate; + return SHOULD_NOT_DELETE_DINODE; + ret = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (ret) + return SHOULD_NOT_DELETE_DINODE; if (test_bit(GIF_INVALID, &ip->i_flags)) { - error = gfs2_inode_refresh(ip); - if (error) - goto out_truncate; + ret = gfs2_inode_refresh(ip); + if (ret) + return SHOULD_NOT_DELETE_DINODE; } /* * The inode may have been recreated in the meantime. */ if (inode->i_nlink) - goto out_truncate; + return SHOULD_NOT_DELETE_DINODE; -out_delete: +should_delete: if (gfs2_holder_initialized(&ip->i_iopen_gh) && test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { if (!gfs2_upgrade_iopen_glock(inode)) { gfs2_holder_uninit(&ip->i_iopen_gh); - goto out_truncate; + return SHOULD_NOT_DELETE_DINODE; } } + return SHOULD_DELETE_DINODE; +} + +/** + * evict_unlinked_inode - delete the pieces of an unlinked evicted inode + * @inode: The inode to evict + */ +static int evict_unlinked_inode(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + int ret; if (S_ISDIR(inode->i_mode) && (ip->i_diskflags & GFS2_DIF_EXHASH)) { - error = gfs2_dir_exhash_dealloc(ip); - if (error) - goto out_unlock; + ret = gfs2_dir_exhash_dealloc(ip); + if (ret) + goto out; } if (ip->i_eattr) { - error = gfs2_ea_dealloc(ip); - if (error) - goto out_unlock; + ret = gfs2_ea_dealloc(ip); + if (ret) + goto out; } if (!gfs2_is_stuffed(ip)) { - error = gfs2_file_dealloc(ip); - if (error) - goto out_unlock; + ret = gfs2_file_dealloc(ip); + if (ret) + goto out; } /* We're about to clear the bitmap for the dinode, but as soon as we @@ -1419,11 +1417,24 @@ out_delete: location and try to set gl_object again. We clear gl_object here so that subsequent inode creates don't see an old gl_object. */ glock_clear_object(ip->i_gl, ip); - error = gfs2_dinode_dealloc(ip); + ret = gfs2_dinode_dealloc(ip); gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino); - goto out_unlock; +out: + return ret; +} + +/* + * evict_linked_inode - evict an inode whose dinode has not been unlinked + * @inode: The inode to evict + */ +static int evict_linked_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct address_space *metamapping; + int ret; -out_truncate: gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_EVICT_INODE); metamapping = gfs2_glock2aspace(ip->i_gl); @@ -1434,15 +1445,63 @@ out_truncate: write_inode_now(inode, 1); gfs2_ail_flush(ip->i_gl, 0); - error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); - if (error) - goto out_unlock; + ret = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); + if (ret) + return ret; + /* Needs to be done before glock release & also in a transaction */ truncate_inode_pages(&inode->i_data, 0); truncate_inode_pages(metamapping, 0); gfs2_trans_end(sdp); + return 0; +} + +/** + * gfs2_evict_inode - Remove an inode from cache + * @inode: The inode to evict + * + * There are three cases to consider: + * 1. i_nlink == 0, we are final opener (and must deallocate) + * 2. i_nlink == 0, we are not the final opener (and cannot deallocate) + * 3. i_nlink > 0 + * + * If the fs is read only, then we have to treat all cases as per #3 + * since we are unable to do any deallocation. The inode will be + * deallocated by the next read/write node to attempt an allocation + * in the same resource group + * + * We have to (at the moment) hold the inodes main lock to cover + * the gap between unlocking the shared lock on the iopen lock and + * taking the exclusive lock. I'd rather do a shared -> exclusive + * conversion on the iopen lock, but we can change that later. This + * is safe, just less efficient. + */ + +static void gfs2_evict_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) { + clear_inode(inode); + return; + } + + if (inode->i_nlink || sb_rdonly(sb)) + goto out; + + gfs2_holder_mark_uninitialized(&gh); + ret = evict_should_delete(inode, &gh); + if (ret == SHOULD_DEFER_EVICTION) + goto out; + if (ret == SHOULD_DELETE_DINODE) + ret = evict_unlinked_inode(inode); + else + ret = evict_linked_inode(inode); -out_unlock: if (gfs2_rs_active(&ip->i_res)) gfs2_rs_deltree(&ip->i_res); @@ -1450,8 +1509,8 @@ out_unlock: glock_clear_object(ip->i_gl, ip); gfs2_glock_dq_uninit(&gh); } - if (error && error != GLR_TRYFAILED && error != -EROFS) - fs_warn(sdp, "gfs2_evict_inode: %d\n", error); + if (ret && ret != GLR_TRYFAILED && ret != -EROFS) + fs_warn(sdp, "gfs2_evict_inode: %d\n", ret); out: truncate_inode_pages_final(&inode->i_data); if (ip->i_qadata) @@ -1502,6 +1561,35 @@ static void gfs2_free_inode(struct inode *inode) kmem_cache_free(gfs2_inode_cachep, GFS2_I(inode)); } +extern void free_local_statfs_inodes(struct gfs2_sbd *sdp) +{ + struct local_statfs_inode *lsi, *safe; + + /* Run through the statfs inodes list to iput and free memory */ + list_for_each_entry_safe(lsi, safe, &sdp->sd_sc_inodes_list, si_list) { + if (lsi->si_jid == sdp->sd_jdesc->jd_jid) + sdp->sd_sc_inode = NULL; /* belongs to this node */ + if (lsi->si_sc_inode) + iput(lsi->si_sc_inode); + list_del(&lsi->si_list); + kfree(lsi); + } +} + +extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp, + unsigned int index) +{ + struct local_statfs_inode *lsi; + + /* Return the local (per node) statfs inode in the + * sdp->sd_sc_inodes_list corresponding to the 'index'. */ + list_for_each_entry(lsi, &sdp->sd_sc_inodes_list, si_list) { + if (lsi->si_jid == index) + return lsi->si_sc_inode; + } + return NULL; +} + const struct super_operations gfs2_super_ops = { .alloc_inode = gfs2_alloc_inode, .free_inode = gfs2_free_inode, diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 51900554ed81..c9fb2a654181 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -37,11 +37,16 @@ extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, s64 dinodes); extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf); +extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, + void *buf); extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, struct buffer_head *l_bh); extern int gfs2_statfs_sync(struct super_block *sb, int type); extern void gfs2_freeze_func(struct work_struct *work); +extern void free_local_statfs_inodes(struct gfs2_sbd *sdp); +extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp, + unsigned int index); extern void free_sbd(struct gfs2_sbd *sdp); extern struct file_system_type gfs2_fs_type; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index d28c41bd69b0..c3e72dba7418 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -303,7 +303,7 @@ static void gfs2_sbd_release(struct kobject *kobj) { struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); - free_sbd(sdp); + complete(&sdp->sd_kobj_unregister); } static struct kobj_type gfs2_ktype = { @@ -655,6 +655,7 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) sprintf(ro, "RDONLY=%d", sb_rdonly(sb)); sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); + init_completion(&sdp->sd_kobj_unregister); sdp->sd_kobj.kset = gfs2_kset; error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, "%s", sdp->sd_table_name); @@ -685,6 +686,7 @@ fail_tune: fail_reg: fs_err(sdp, "error %d adding sysfs files\n", error); kobject_put(&sdp->sd_kobj); + wait_for_completion(&sdp->sd_kobj_unregister); sb->s_fs_info = NULL; return error; } @@ -695,6 +697,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp) sysfs_remove_group(&sdp->sd_kobj, &tune_group); sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); kobject_put(&sdp->sd_kobj); + wait_for_completion(&sdp->sd_kobj_unregister); } static int gfs2_uevent(struct kset *kset, struct kobject *kobj, diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index e0025258107a..0b2f858d9a8c 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -56,7 +56,6 @@ {(1UL << GLF_REPLY_PENDING), "r" }, \ {(1UL << GLF_INITIAL), "I" }, \ {(1UL << GLF_FROZEN), "F" }, \ - {(1UL << GLF_QUEUED), "q" }, \ {(1UL << GLF_LRU), "L" }, \ {(1UL << GLF_OBJECT), "o" }, \ {(1UL << GLF_BLOCKING), "b" }) @@ -388,15 +387,17 @@ TRACE_EVENT(gfs2_log_blocks, TP_STRUCT__entry( __field( dev_t, dev ) __field( int, blocks ) + __field( int, blks_free ) ), TP_fast_assign( __entry->dev = sdp->sd_vfs->s_dev; __entry->blocks = blocks; + __entry->blks_free = atomic_read(&sdp->sd_log_blks_free); ), - TP_printk("%u,%u log reserve %d", MAJOR(__entry->dev), - MINOR(__entry->dev), __entry->blocks) + TP_printk("%u,%u log reserve %d %d", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->blocks, __entry->blks_free) ); /* Writing back the AIL */ diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 1cd0328cae20..0fba3bf64189 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -419,7 +419,7 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, char fs_id_buf[sizeof(sdp->sd_fsname) + 7]; sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname); - gfs2_rgrp_dump(NULL, rgd->rd_gl, fs_id_buf); + gfs2_rgrp_dump(NULL, rgd, fs_id_buf); gfs2_lm(sdp, "fatal: filesystem consistency error\n" " RG = %llu\n" diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 6d9157efe16c..d7562981b3a0 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -205,6 +205,16 @@ static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp) test_bit(SDF_WITHDRAWING, &sdp->sd_flags); } +/** + * gfs2_withdrawing - check if a withdraw is pending + * @sdp: the superblock + */ +static inline bool gfs2_withdrawing(struct gfs2_sbd *sdp) +{ + return test_bit(SDF_WITHDRAWING, &sdp->sd_flags) && + !test_bit(SDF_WITHDRAWN, &sdp->sd_flags); +} + #define gfs2_tune_get(sdp, field) \ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) diff --git a/fs/hfs/super.c b/fs/hfs/super.c index c33324686d89..44d07c9e3a7f 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -104,8 +104,7 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; buf->f_files = HFS_SB(sb)->fs_ablocks; buf->f_ffree = HFS_SB(sb)->free_ablocks; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = HFS_NAMELEN; return 0; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 129dca3f4b78..807119ae5adf 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -320,8 +320,7 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; buf->f_files = 0xFFFFFFFF; buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = HFSPLUS_MAX_STRLEN; return 0; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 0a677a9aaf34..a7dbfc892022 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -192,8 +192,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = sbi->sb_n_free; buf->f_files = sbi->sb_dirband_size / 4; buf->f_ffree = hpfs_get_free_dnodes(s); - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = 254; hpfs_unlock(s); diff --git a/fs/io-wq.c b/fs/io-wq.c index 0a182f1333e8..02894df7656d 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -18,7 +18,10 @@ #include <linux/fs_struct.h> #include <linux/task_work.h> #include <linux/blk-cgroup.h> +#include <linux/audit.h> +#include <linux/cpu.h> +#include "../kernel/sched/sched.h" #include "io-wq.h" #define WORKER_IDLE_TIMEOUT (5 * HZ) @@ -122,9 +125,13 @@ struct io_wq { refcount_t refs; struct completion done; + struct hlist_node cpuhp_node; + refcount_t use_refs; }; +static enum cpuhp_state io_wq_online; + static bool io_worker_get(struct io_worker *worker) { return refcount_inc_not_zero(&worker->ref); @@ -186,7 +193,8 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) worker->blkcg_css = NULL; } #endif - + if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY) + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; return dropped_lock; } @@ -429,14 +437,10 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) mmput(worker->mm); worker->mm = NULL; } - if (!work->mm) - return; - if (mmget_not_zero(work->mm)) { - kthread_use_mm(work->mm); - worker->mm = work->mm; - /* hang on to this mm */ - work->mm = NULL; + if (mmget_not_zero(work->identity->mm)) { + kthread_use_mm(work->identity->mm); + worker->mm = work->identity->mm; return; } @@ -448,9 +452,11 @@ static inline void io_wq_switch_blkcg(struct io_worker *worker, struct io_wq_work *work) { #ifdef CONFIG_BLK_CGROUP - if (work->blkcg_css != worker->blkcg_css) { - kthread_associate_blkcg(work->blkcg_css); - worker->blkcg_css = work->blkcg_css; + if (!(work->flags & IO_WQ_WORK_BLKCG)) + return; + if (work->identity->blkcg_css != worker->blkcg_css) { + kthread_associate_blkcg(work->identity->blkcg_css); + worker->blkcg_css = work->identity->blkcg_css; } #endif } @@ -458,9 +464,9 @@ static inline void io_wq_switch_blkcg(struct io_worker *worker, static void io_wq_switch_creds(struct io_worker *worker, struct io_wq_work *work) { - const struct cred *old_creds = override_creds(work->creds); + const struct cred *old_creds = override_creds(work->identity->creds); - worker->cur_creds = work->creds; + worker->cur_creds = work->identity->creds; if (worker->saved_creds) put_cred(old_creds); /* creds set by previous switch */ else @@ -470,20 +476,29 @@ static void io_wq_switch_creds(struct io_worker *worker, static void io_impersonate_work(struct io_worker *worker, struct io_wq_work *work) { - if (work->files && current->files != work->files) { + if ((work->flags & IO_WQ_WORK_FILES) && + current->files != work->identity->files) { task_lock(current); - current->files = work->files; - current->nsproxy = work->nsproxy; + current->files = work->identity->files; + current->nsproxy = work->identity->nsproxy; task_unlock(current); } - if (work->fs && current->fs != work->fs) - current->fs = work->fs; - if (work->mm != worker->mm) + if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs) + current->fs = work->identity->fs; + if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm) io_wq_switch_mm(worker, work); - if (worker->cur_creds != work->creds) + if ((work->flags & IO_WQ_WORK_CREDS) && + worker->cur_creds != work->identity->creds) io_wq_switch_creds(worker, work); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize; + if (work->flags & IO_WQ_WORK_FSIZE) + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize; + else if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY) + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; io_wq_switch_blkcg(worker, work); +#ifdef CONFIG_AUDIT + current->loginuid = work->identity->loginuid; + current->sessionid = work->identity->sessionid; +#endif } static void io_assign_current_work(struct io_worker *worker, @@ -496,6 +511,11 @@ static void io_assign_current_work(struct io_worker *worker, cond_resched(); } +#ifdef CONFIG_AUDIT + current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET); + current->sessionid = AUDIT_SID_UNSET; +#endif + spin_lock_irq(&worker->lock); worker->cur_work = work; spin_unlock_irq(&worker->lock); @@ -676,6 +696,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) kfree(worker); return false; } + kthread_bind_mask(worker->task, cpumask_of_node(wqe->node)); raw_spin_lock_irq(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); @@ -1076,10 +1097,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) return ERR_PTR(-ENOMEM); wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL); - if (!wq->wqes) { - kfree(wq); - return ERR_PTR(-ENOMEM); - } + if (!wq->wqes) + goto err_wq; + + ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); + if (ret) + goto err_wqes; wq->free_work = data->free_work; wq->do_work = data->do_work; @@ -1087,6 +1110,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) /* caller must already hold a reference to this */ wq->user = data->user; + ret = -ENOMEM; for_each_node(node) { struct io_wqe *wqe; int alloc_node = node; @@ -1130,9 +1154,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) ret = PTR_ERR(wq->manager); complete(&wq->done); err: + cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); for_each_node(node) kfree(wq->wqes[node]); +err_wqes: kfree(wq->wqes); +err_wq: kfree(wq); return ERR_PTR(ret); } @@ -1149,6 +1176,8 @@ static void __io_wq_destroy(struct io_wq *wq) { int node; + cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); + set_bit(IO_WQ_BIT_EXIT, &wq->state); if (wq->manager) kthread_stop(wq->manager); @@ -1176,3 +1205,41 @@ struct task_struct *io_wq_get_task(struct io_wq *wq) { return wq->manager; } + +static bool io_wq_worker_affinity(struct io_worker *worker, void *data) +{ + struct task_struct *task = worker->task; + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(task, &rf); + do_set_cpus_allowed(task, cpumask_of_node(worker->wqe->node)); + task->flags |= PF_NO_SETAFFINITY; + task_rq_unlock(rq, task, &rf); + return false; +} + +static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node) +{ + struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node); + int i; + + rcu_read_lock(); + for_each_node(i) + io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, NULL); + rcu_read_unlock(); + return 0; +} + +static __init int io_wq_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online", + io_wq_cpu_online, NULL); + if (ret < 0) + return ret; + io_wq_online = ret; + return 0; +} +subsys_initcall(io_wq_init); diff --git a/fs/io-wq.h b/fs/io-wq.h index 84bcf6a85523..cba36f03c355 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -1,6 +1,8 @@ #ifndef INTERNAL_IO_WQ_H #define INTERNAL_IO_WQ_H +#include <linux/io_uring.h> + struct io_wq; enum { @@ -10,6 +12,13 @@ enum { IO_WQ_WORK_NO_CANCEL = 8, IO_WQ_WORK_CONCURRENT = 16, + IO_WQ_WORK_FILES = 32, + IO_WQ_WORK_FS = 64, + IO_WQ_WORK_MM = 128, + IO_WQ_WORK_CREDS = 256, + IO_WQ_WORK_BLKCG = 512, + IO_WQ_WORK_FSIZE = 1024, + IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; @@ -85,15 +94,7 @@ static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work { struct io_wq_work_node list; - struct files_struct *files; - struct mm_struct *mm; -#ifdef CONFIG_BLK_CGROUP - struct cgroup_subsys_state *blkcg_css; -#endif - const struct cred *creds; - struct nsproxy *nsproxy; - struct fs_struct *fs; - unsigned long fsize; + struct io_identity *identity; unsigned flags; }; diff --git a/fs/io_uring.c b/fs/io_uring.c index b58169240c77..b42dfa0243bf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -81,6 +81,7 @@ #include <linux/pagemap.h> #include <linux/io_uring.h> #include <linux/blk-cgroup.h> +#include <linux/audit.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -276,7 +277,7 @@ struct io_ring_ctx { unsigned sq_mask; unsigned sq_thread_idle; unsigned cached_sq_dropped; - atomic_t cached_cq_overflow; + unsigned cached_cq_overflow; unsigned long sq_check_overflow; struct list_head defer_list; @@ -327,6 +328,11 @@ struct io_ring_ctx { const struct cred *creds; +#ifdef CONFIG_AUDIT + kuid_t loginuid; + unsigned int sessionid; +#endif + struct completion ref_comp; struct completion sq_thread_comp; @@ -574,12 +580,12 @@ enum { REQ_F_NOWAIT_BIT, REQ_F_LINK_TIMEOUT_BIT, REQ_F_ISREG_BIT, - REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, REQ_F_NO_FILE_TABLE_BIT, REQ_F_WORK_INITIALIZED_BIT, + REQ_F_LTIMEOUT_ACTIVE_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -609,12 +615,10 @@ enum { REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), /* must not punt to workers */ REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), - /* has linked timeout */ + /* has or had linked timeout */ REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), - /* completion under lock */ - REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), /* needs cleanup */ REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), /* already went through poll handler */ @@ -625,6 +629,8 @@ enum { REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), /* io_wq_work is initialized */ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), + /* linked timeout is active, i.e. prepared by link's head */ + REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT), }; struct async_poll { @@ -732,8 +738,6 @@ struct io_submit_state { }; struct io_op_def { - /* needs current->mm setup, does mm access */ - unsigned needs_mm : 1; /* needs req->file assigned */ unsigned needs_file : 1; /* don't fail if file grab fails */ @@ -744,67 +748,58 @@ struct io_op_def { unsigned unbound_nonreg_file : 1; /* opcode is not supported by this kernel */ unsigned not_supported : 1; - /* needs file table */ - unsigned file_table : 1; - /* needs ->fs */ - unsigned needs_fs : 1; /* set if opcode supports polled "wait" */ unsigned pollin : 1; unsigned pollout : 1; /* op supports buffer selection */ unsigned buffer_select : 1; - /* needs rlimit(RLIMIT_FSIZE) assigned */ - unsigned needs_fsize : 1; /* must always have async data allocated */ unsigned needs_async_data : 1; - /* needs blkcg context, issues async io potentially */ - unsigned needs_blkcg : 1; /* size of async data needed, if any */ unsigned short async_size; + unsigned work_flags; }; -static const struct io_op_def io_op_defs[] __read_mostly = { +static const struct io_op_def io_op_defs[] = { [IORING_OP_NOP] = {}, [IORING_OP_READV] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, .needs_async_data = 1, - .needs_blkcg = 1, .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_WRITEV] = { - .needs_mm = 1, .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_fsize = 1, .needs_async_data = 1, - .needs_blkcg = 1, .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FSIZE, }, [IORING_OP_FSYNC] = { .needs_file = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_READ_FIXED] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, - .needs_blkcg = 1, .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_fsize = 1, - .needs_blkcg = 1, .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE | + IO_WQ_WORK_MM, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -813,137 +808,122 @@ static const struct io_op_def io_op_defs[] __read_mostly = { [IORING_OP_POLL_REMOVE] = {}, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_SENDMSG] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, - .needs_fs = 1, .pollout = 1, .needs_async_data = 1, - .needs_blkcg = 1, .async_size = sizeof(struct io_async_msghdr), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FS, }, [IORING_OP_RECVMSG] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, - .needs_fs = 1, .pollin = 1, .buffer_select = 1, .needs_async_data = 1, - .needs_blkcg = 1, .async_size = sizeof(struct io_async_msghdr), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FS, }, [IORING_OP_TIMEOUT] = { - .needs_mm = 1, .needs_async_data = 1, .async_size = sizeof(struct io_timeout_data), + .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_TIMEOUT_REMOVE] = {}, [IORING_OP_ACCEPT] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, - .file_table = 1, .pollin = 1, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES, }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { - .needs_mm = 1, .needs_async_data = 1, .async_size = sizeof(struct io_timeout_data), + .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_CONNECT] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_connect), + .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_FALLOCATE] = { .needs_file = 1, - .needs_fsize = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE, }, [IORING_OP_OPENAT] = { - .file_table = 1, - .needs_fs = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FS, }, [IORING_OP_CLOSE] = { .needs_file = 1, .needs_file_no_error = 1, - .file_table = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG, }, [IORING_OP_FILES_UPDATE] = { - .needs_mm = 1, - .file_table = 1, + .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM, }, [IORING_OP_STATX] = { - .needs_mm = 1, - .needs_fs = 1, - .file_table = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM | + IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, }, [IORING_OP_READ] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - .needs_blkcg = 1, .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_WRITE] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_fsize = 1, - .needs_blkcg = 1, .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FSIZE, }, [IORING_OP_FADVISE] = { .needs_file = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_MADVISE] = { - .needs_mm = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_SEND] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_RECV] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_OPENAT2] = { - .file_table = 1, - .needs_fs = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS | + IO_WQ_WORK_BLKCG, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, - .file_table = 1, + .work_flags = IO_WQ_WORK_FILES, }, [IORING_OP_SPLICE] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_PROVIDE_BUFFERS] = {}, [IORING_OP_REMOVE_BUFFERS] = {}, @@ -963,8 +943,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2, struct io_comp_state *cs); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); +static void io_put_req_deferred(struct io_kiocb *req, int nr); static void io_double_put_req(struct io_kiocb *req); -static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void __io_queue_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); @@ -986,7 +966,7 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, static struct kmem_cache *req_cachep; -static const struct file_operations io_uring_fops __read_mostly; +static const struct file_operations io_uring_fops; struct sock *io_uring_get_socket(struct file *file) { @@ -1034,7 +1014,7 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, struct io_kiocb *req) { - if (!io_op_defs[req->opcode].needs_mm) + if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM)) return 0; return __io_sq_thread_acquire_mm(ctx); } @@ -1066,16 +1046,53 @@ static inline void req_set_fail_links(struct io_kiocb *req) } /* + * None of these are dereferenced, they are simply used to check if any of + * them have changed. If we're under current and check they are still the + * same, we're fine to grab references to them for actual out-of-line use. + */ +static void io_init_identity(struct io_identity *id) +{ + id->files = current->files; + id->mm = current->mm; +#ifdef CONFIG_BLK_CGROUP + rcu_read_lock(); + id->blkcg_css = blkcg_css(); + rcu_read_unlock(); +#endif + id->creds = current_cred(); + id->nsproxy = current->nsproxy; + id->fs = current->fs; + id->fsize = rlimit(RLIMIT_FSIZE); +#ifdef CONFIG_AUDIT + id->loginuid = current->loginuid; + id->sessionid = current->sessionid; +#endif + refcount_set(&id->count, 1); +} + +static inline void __io_req_init_async(struct io_kiocb *req) +{ + memset(&req->work, 0, sizeof(req->work)); + req->flags |= REQ_F_WORK_INITIALIZED; +} + +/* * Note: must call io_req_init_async() for the first time you * touch any members of io_wq_work. */ static inline void io_req_init_async(struct io_kiocb *req) { + struct io_uring_task *tctx = current->io_uring; + if (req->flags & REQ_F_WORK_INITIALIZED) return; - memset(&req->work, 0, sizeof(req->work)); - req->flags |= REQ_F_WORK_INITIALIZED; + __io_req_init_async(req); + + /* Grab a ref if this isn't our static identity */ + req->work.identity = tctx->identity; + if (tctx->identity != &tctx->__identity) + refcount_inc(&req->work.identity->count); } static inline bool io_async_submit(struct io_ring_ctx *ctx) @@ -1162,7 +1179,7 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) struct io_ring_ctx *ctx = req->ctx; return seq != ctx->cached_cq_tail - + atomic_read(&ctx->cached_cq_overflow); + + READ_ONCE(ctx->cached_cq_overflow); } return false; @@ -1181,105 +1198,198 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -/* - * Returns true if we need to defer file table putting. This can only happen - * from the error path with REQ_F_COMP_LOCKED set. - */ -static bool io_req_clean_work(struct io_kiocb *req) +static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req) +{ + if (req->work.identity == &tctx->__identity) + return; + if (refcount_dec_and_test(&req->work.identity->count)) + kfree(req->work.identity); +} + +static void io_req_clean_work(struct io_kiocb *req) { if (!(req->flags & REQ_F_WORK_INITIALIZED)) - return false; + return; req->flags &= ~REQ_F_WORK_INITIALIZED; - if (req->work.mm) { - mmdrop(req->work.mm); - req->work.mm = NULL; + if (req->work.flags & IO_WQ_WORK_MM) { + mmdrop(req->work.identity->mm); + req->work.flags &= ~IO_WQ_WORK_MM; } #ifdef CONFIG_BLK_CGROUP - if (req->work.blkcg_css) - css_put(req->work.blkcg_css); + if (req->work.flags & IO_WQ_WORK_BLKCG) { + css_put(req->work.identity->blkcg_css); + req->work.flags &= ~IO_WQ_WORK_BLKCG; + } #endif - if (req->work.creds) { - put_cred(req->work.creds); - req->work.creds = NULL; + if (req->work.flags & IO_WQ_WORK_CREDS) { + put_cred(req->work.identity->creds); + req->work.flags &= ~IO_WQ_WORK_CREDS; } - if (req->work.fs) { - struct fs_struct *fs = req->work.fs; - - if (req->flags & REQ_F_COMP_LOCKED) - return true; + if (req->work.flags & IO_WQ_WORK_FS) { + struct fs_struct *fs = req->work.identity->fs; - spin_lock(&req->work.fs->lock); + spin_lock(&req->work.identity->fs->lock); if (--fs->users) fs = NULL; - spin_unlock(&req->work.fs->lock); + spin_unlock(&req->work.identity->fs->lock); if (fs) free_fs_struct(fs); - req->work.fs = NULL; + req->work.flags &= ~IO_WQ_WORK_FS; } - return false; + io_put_identity(req->task->io_uring, req); } -static void io_prep_async_work(struct io_kiocb *req) +/* + * Create a private copy of io_identity, since some fields don't match + * the current context. + */ +static bool io_identity_cow(struct io_kiocb *req) +{ + struct io_uring_task *tctx = current->io_uring; + const struct cred *creds = NULL; + struct io_identity *id; + + if (req->work.flags & IO_WQ_WORK_CREDS) + creds = req->work.identity->creds; + + id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL); + if (unlikely(!id)) { + req->work.flags |= IO_WQ_WORK_CANCEL; + return false; + } + + /* + * We can safely just re-init the creds we copied Either the field + * matches the current one, or we haven't grabbed it yet. The only + * exception is ->creds, through registered personalities, so handle + * that one separately. + */ + io_init_identity(id); + if (creds) + req->work.identity->creds = creds; + + /* add one for this request */ + refcount_inc(&id->count); + + /* drop old identity, assign new one. one ref for req, one for tctx */ + if (req->work.identity != tctx->identity && + refcount_sub_and_test(2, &req->work.identity->count)) + kfree(req->work.identity); + + req->work.identity = id; + tctx->identity = id; + return true; +} + +static bool io_grab_identity(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_identity *id = req->work.identity; struct io_ring_ctx *ctx = req->ctx; - io_req_init_async(req); - - if (req->flags & REQ_F_ISREG) { - if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) - io_wq_hash_work(&req->work, file_inode(req->file)); - } else { - if (def->unbound_nonreg_file) - req->work.flags |= IO_WQ_WORK_UNBOUND; + if (def->work_flags & IO_WQ_WORK_FSIZE) { + if (id->fsize != rlimit(RLIMIT_FSIZE)) + return false; + req->work.flags |= IO_WQ_WORK_FSIZE; } - if (!req->work.files && io_op_defs[req->opcode].file_table && + + if (!(req->work.flags & IO_WQ_WORK_FILES) && + (def->work_flags & IO_WQ_WORK_FILES) && !(req->flags & REQ_F_NO_FILE_TABLE)) { - req->work.files = get_files_struct(current); - get_nsproxy(current->nsproxy); - req->work.nsproxy = current->nsproxy; + if (id->files != current->files || + id->nsproxy != current->nsproxy) + return false; + atomic_inc(&id->files->count); + get_nsproxy(id->nsproxy); req->flags |= REQ_F_INFLIGHT; spin_lock_irq(&ctx->inflight_lock); list_add(&req->inflight_entry, &ctx->inflight_list); spin_unlock_irq(&ctx->inflight_lock); - } - if (!req->work.mm && def->needs_mm) { - mmgrab(current->mm); - req->work.mm = current->mm; + req->work.flags |= IO_WQ_WORK_FILES; } #ifdef CONFIG_BLK_CGROUP - if (!req->work.blkcg_css && def->needs_blkcg) { + if (!(req->work.flags & IO_WQ_WORK_BLKCG) && + (def->work_flags & IO_WQ_WORK_BLKCG)) { rcu_read_lock(); - req->work.blkcg_css = blkcg_css(); + if (id->blkcg_css != blkcg_css()) { + rcu_read_unlock(); + return false; + } /* * This should be rare, either the cgroup is dying or the task * is moving cgroups. Just punt to root for the handful of ios. */ - if (!css_tryget_online(req->work.blkcg_css)) - req->work.blkcg_css = NULL; + if (css_tryget_online(id->blkcg_css)) + req->work.flags |= IO_WQ_WORK_BLKCG; rcu_read_unlock(); } #endif - if (!req->work.creds) - req->work.creds = get_current_cred(); - if (!req->work.fs && def->needs_fs) { - spin_lock(¤t->fs->lock); - if (!current->fs->in_exec) { - req->work.fs = current->fs; - req->work.fs->users++; + if (!(req->work.flags & IO_WQ_WORK_CREDS)) { + if (id->creds != current_cred()) + return false; + get_cred(id->creds); + req->work.flags |= IO_WQ_WORK_CREDS; + } +#ifdef CONFIG_AUDIT + if (!uid_eq(current->loginuid, id->loginuid) || + current->sessionid != id->sessionid) + return false; +#endif + if (!(req->work.flags & IO_WQ_WORK_FS) && + (def->work_flags & IO_WQ_WORK_FS)) { + if (current->fs != id->fs) + return false; + spin_lock(&id->fs->lock); + if (!id->fs->in_exec) { + id->fs->users++; + req->work.flags |= IO_WQ_WORK_FS; } else { req->work.flags |= IO_WQ_WORK_CANCEL; } spin_unlock(¤t->fs->lock); } - if (def->needs_fsize) - req->work.fsize = rlimit(RLIMIT_FSIZE); - else - req->work.fsize = RLIM_INFINITY; + + return true; +} + +static void io_prep_async_work(struct io_kiocb *req) +{ + const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_ring_ctx *ctx = req->ctx; + struct io_identity *id; + + io_req_init_async(req); + id = req->work.identity; + + if (req->flags & REQ_F_ISREG) { + if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) + io_wq_hash_work(&req->work, file_inode(req->file)); + } else { + if (def->unbound_nonreg_file) + req->work.flags |= IO_WQ_WORK_UNBOUND; + } + + /* ->mm can never change on us */ + if (!(req->work.flags & IO_WQ_WORK_MM) && + (def->work_flags & IO_WQ_WORK_MM)) { + mmgrab(id->mm); + req->work.flags |= IO_WQ_WORK_MM; + } + + /* if we fail grabbing identity, we must COW, regrab, and retry */ + if (io_grab_identity(req)) + return; + + if (!io_identity_cow(req)) + return; + + /* can't fail at this point */ + if (!io_grab_identity(req)) + WARN_ON(1); } static void io_prep_async_link(struct io_kiocb *req) @@ -1325,9 +1435,8 @@ static void io_kill_timeout(struct io_kiocb *req) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); - req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, 0); - io_put_req(req); + io_put_req_deferred(req, 1); } } @@ -1378,8 +1487,7 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) if (link) { __io_queue_linked_timeout(link); /* drop submission reference */ - link->flags |= REQ_F_COMP_LOCKED; - io_put_req(link); + io_put_req_deferred(link, 1); } kfree(de); } while (!list_empty(&ctx->defer_list)); @@ -1471,8 +1579,9 @@ static inline bool io_match_files(struct io_kiocb *req, { if (!files) return true; - if (req->flags & REQ_F_WORK_INITIALIZED) - return req->work.files == files; + if ((req->flags & REQ_F_WORK_INITIALIZED) && + (req->work.flags & IO_WQ_WORK_FILES)) + return req->work.identity->files == files; return false; } @@ -1518,8 +1627,9 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, WRITE_ONCE(cqe->res, req->result); WRITE_ONCE(cqe->flags, req->compl.cflags); } else { + ctx->cached_cq_overflow++; WRITE_ONCE(ctx->rings->cq_overflow, - atomic_inc_return(&ctx->cached_cq_overflow)); + ctx->cached_cq_overflow); } } @@ -1561,8 +1671,8 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) * then we cannot store the request for later flushing, we need * to drop it on the floor. */ - WRITE_ONCE(ctx->rings->cq_overflow, - atomic_inc_return(&ctx->cached_cq_overflow)); + ctx->cached_cq_overflow++; + WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow); } else { if (list_empty(&ctx->cq_overflow_list)) { set_bit(0, &ctx->sq_check_overflow); @@ -1606,13 +1716,19 @@ static void io_submit_flush_completions(struct io_comp_state *cs) req = list_first_entry(&cs->list, struct io_kiocb, compl.list); list_del(&req->compl.list); __io_cqring_fill_event(req, req->result, req->compl.cflags); - if (!(req->flags & REQ_F_LINK_HEAD)) { - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - } else { + + /* + * io_free_req() doesn't care about completion_lock unless one + * of these flags is set. REQ_F_WORK_INITIALIZED is in the list + * because of a potential deadlock with req->work.fs->lock + */ + if (req->flags & (REQ_F_FAIL_LINK|REQ_F_LINK_TIMEOUT + |REQ_F_WORK_INITIALIZED)) { spin_unlock_irq(&ctx->completion_lock); io_put_req(req); spin_lock_irq(&ctx->completion_lock); + } else { + io_put_req(req); } } io_commit_cqring(ctx); @@ -1699,7 +1815,7 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file, fput(file); } -static bool io_dismantle_req(struct io_kiocb *req) +static void io_dismantle_req(struct io_kiocb *req) { io_clean_op(req); @@ -1708,15 +1824,17 @@ static bool io_dismantle_req(struct io_kiocb *req) if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - return io_req_clean_work(req); + io_req_clean_work(req); } -static void __io_free_req_finish(struct io_kiocb *req) +static void __io_free_req(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; - atomic_long_inc(&tctx->req_complete); + io_dismantle_req(req); + + percpu_counter_dec(&tctx->inflight); if (tctx->in_idle) wake_up(&tctx->wait); put_task_struct(req->task); @@ -1728,39 +1846,6 @@ static void __io_free_req_finish(struct io_kiocb *req) percpu_ref_put(&ctx->refs); } -static void io_req_task_file_table_put(struct callback_head *cb) -{ - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); - struct fs_struct *fs = req->work.fs; - - spin_lock(&req->work.fs->lock); - if (--fs->users) - fs = NULL; - spin_unlock(&req->work.fs->lock); - if (fs) - free_fs_struct(fs); - req->work.fs = NULL; - __io_free_req_finish(req); -} - -static void __io_free_req(struct io_kiocb *req) -{ - if (!io_dismantle_req(req)) { - __io_free_req_finish(req); - } else { - int ret; - - init_task_work(&req->task_work, io_req_task_file_table_put); - ret = task_work_add(req->task, &req->task_work, TWA_RESUME); - if (unlikely(ret)) { - struct task_struct *tsk; - - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, 0); - } - } -} - static bool io_link_cancel_timeout(struct io_kiocb *req) { struct io_timeout_data *io = req->async_data; @@ -1772,7 +1857,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(ctx); req->flags &= ~REQ_F_LINK_HEAD; - io_put_req(req); + io_put_req_deferred(req, 1); return true; } @@ -1789,9 +1874,14 @@ static bool __io_kill_linked_timeout(struct io_kiocb *req) link = list_first_entry(&req->link_list, struct io_kiocb, link_list); if (link->opcode != IORING_OP_LINK_TIMEOUT) return false; + /* + * Can happen if a linked timeout fired and link had been like + * req -> link t-out -> link t-out [-> ...] + */ + if (!(link->flags & REQ_F_LTIMEOUT_ACTIVE)) + return false; list_del_init(&link->link_list); - link->flags |= REQ_F_COMP_LOCKED; wake_ev = io_link_cancel_timeout(link); req->flags &= ~REQ_F_LINK_TIMEOUT; return wake_ev; @@ -1800,17 +1890,12 @@ static bool __io_kill_linked_timeout(struct io_kiocb *req) static void io_kill_linked_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; bool wake_ev; - if (!(req->flags & REQ_F_COMP_LOCKED)) { - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); - wake_ev = __io_kill_linked_timeout(req); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - } else { - wake_ev = __io_kill_linked_timeout(req); - } + spin_lock_irqsave(&ctx->completion_lock, flags); + wake_ev = __io_kill_linked_timeout(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); if (wake_ev) io_cqring_ev_posted(ctx); @@ -1838,10 +1923,12 @@ static struct io_kiocb *io_req_link_next(struct io_kiocb *req) /* * Called if REQ_F_LINK_HEAD is set, and we fail the head request */ -static void __io_fail_links(struct io_kiocb *req) +static void io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { struct io_kiocb *link = list_first_entry(&req->link_list, struct io_kiocb, link_list); @@ -1850,28 +1937,20 @@ static void __io_fail_links(struct io_kiocb *req) trace_io_uring_fail_link(req, link); io_cqring_fill_event(link, -ECANCELED); - link->flags |= REQ_F_COMP_LOCKED; - __io_double_put_req(link); - req->flags &= ~REQ_F_LINK_TIMEOUT; + + /* + * It's ok to free under spinlock as they're not linked anymore, + * but avoid REQ_F_WORK_INITIALIZED because it may deadlock on + * work.fs->lock. + */ + if (link->flags & REQ_F_WORK_INITIALIZED) + io_put_req_deferred(link, 2); + else + io_double_put_req(link); } io_commit_cqring(ctx); - io_cqring_ev_posted(ctx); -} - -static void io_fail_links(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - - if (!(req->flags & REQ_F_COMP_LOCKED)) { - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); - __io_fail_links(req); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - } else { - __io_fail_links(req); - } + spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); } @@ -1905,7 +1984,8 @@ static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok) { struct task_struct *tsk = req->task; struct io_ring_ctx *ctx = req->ctx; - int ret, notify; + enum task_work_notify_mode notify; + int ret; if (tsk->flags & PF_EXITING) return -ESRCH; @@ -1916,7 +1996,7 @@ static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok) * processing task_work. There's no reliable way to tell if TWA_RESUME * will do the job. */ - notify = 0; + notify = TWA_NONE; if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok) notify = TWA_SIGNAL; @@ -1985,7 +2065,7 @@ static void io_req_task_queue(struct io_kiocb *req) init_task_work(&req->task_work, io_req_task_cancel); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, 0); + task_work_add(tsk, &req->task_work, TWA_NONE); wake_up_process(tsk); } } @@ -2033,7 +2113,9 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx, if (rb->to_free) __io_req_free_batch_flush(ctx, rb); if (rb->task) { - atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete); + struct io_uring_task *tctx = rb->task->io_uring; + + percpu_counter_sub(&tctx->inflight, rb->task_refs); put_task_struct_many(rb->task, rb->task_refs); rb->task = NULL; } @@ -2050,7 +2132,9 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) if (req->task != rb->task) { if (rb->task) { - atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete); + struct io_uring_task *tctx = rb->task->io_uring; + + percpu_counter_sub(&tctx->inflight, rb->task_refs); put_task_struct_many(rb->task, rb->task_refs); } rb->task = req->task; @@ -2058,7 +2142,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) } rb->task_refs++; - WARN_ON_ONCE(io_dismantle_req(req)); + io_dismantle_req(req); rb->reqs[rb->to_free++] = req; if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) __io_req_free_batch_flush(req->ctx, rb); @@ -2085,6 +2169,34 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } +static void io_put_req_deferred_cb(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + + io_free_req(req); +} + +static void io_free_req_deferred(struct io_kiocb *req) +{ + int ret; + + init_task_work(&req->task_work, io_put_req_deferred_cb); + ret = io_req_task_work_add(req, true); + if (unlikely(ret)) { + struct task_struct *tsk; + + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &req->task_work, TWA_NONE); + wake_up_process(tsk); + } +} + +static inline void io_put_req_deferred(struct io_kiocb *req, int refs) +{ + if (refcount_sub_and_test(refs, &req->refs)) + io_free_req_deferred(req); +} + static struct io_wq_work *io_steal_work(struct io_kiocb *req) { struct io_kiocb *nxt; @@ -2101,17 +2213,6 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req) return nxt ? &nxt->work : NULL; } -/* - * Must only be used if we don't need to care about links, usually from - * within the completion handling itself. - */ -static void __io_double_put_req(struct io_kiocb *req) -{ - /* drop both submit and complete references */ - if (refcount_sub_and_test(2, &req->refs)) - __io_free_req(req); -} - static void io_double_put_req(struct io_kiocb *req) { /* drop both submit and complete references */ @@ -2601,7 +2702,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) static bool io_bdev_nowait(struct block_device *bdev) { #ifdef CONFIG_BLOCK - return !bdev || queue_is_mq(bdev_get_queue(bdev)); + return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); #else return true; #endif @@ -3016,9 +3117,10 @@ static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) * For files that don't have ->read_iter() and ->write_iter(), handle them * by looping over ->read() or ->write() manually. */ -static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, - struct iov_iter *iter) +static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) { + struct kiocb *kiocb = &req->rw.kiocb; + struct file *file = req->file; ssize_t ret = 0; /* @@ -3038,11 +3140,8 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, if (!iov_iter_is_bvec(iter)) { iovec = iov_iter_iovec(iter); } else { - /* fixed buffers import bvec */ - iovec.iov_base = kmap(iter->bvec->bv_page) - + iter->iov_offset; - iovec.iov_len = min(iter->count, - iter->bvec->bv_len - iter->iov_offset); + iovec.iov_base = u64_to_user_ptr(req->rw.addr); + iovec.iov_len = req->rw.len; } if (rw == READ) { @@ -3053,9 +3152,6 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, iovec.iov_len, io_kiocb_ppos(kiocb)); } - if (iov_iter_is_bvec(iter)) - kunmap(iter->bvec->bv_page); - if (nr < 0) { if (!ret) ret = nr; @@ -3064,6 +3160,8 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, ret += nr; if (nr != iovec.iov_len) break; + req->rw.len -= nr; + req->rw.addr += nr; iov_iter_advance(iter, nr); } @@ -3199,7 +3297,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, /* queue just for cancelation */ init_task_work(&req->task_work, io_req_task_cancel); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, 0); + task_work_add(tsk, &req->task_work, TWA_NONE); wake_up_process(tsk); } return 1; @@ -3253,7 +3351,7 @@ static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) if (req->file->f_op->read_iter) return call_read_iter(req->file, &req->rw.kiocb, iter); else if (req->file->f_op->read) - return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter); + return loop_rw_iter(READ, req, iter); else return -EINVAL; } @@ -3444,7 +3542,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, if (req->file->f_op->write_iter) ret2 = call_write_iter(req->file, kiocb, iter); else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, req->file, kiocb, iter); + ret2 = loop_rw_iter(WRITE, req, iter); else ret2 = -EINVAL; @@ -4123,7 +4221,7 @@ static int io_close(struct io_kiocb *req, bool force_nonblock, } /* No ->flush() or already async, safely close from here */ - ret = filp_close(close->put_file, req->work.files); + ret = filp_close(close->put_file, req->work.identity->files); if (ret < 0) req_set_fail_links(req); fput(close->put_file); @@ -4765,7 +4863,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, WRITE_ONCE(poll->canceled, true); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, 0); + task_work_add(tsk, &req->task_work, TWA_NONE); wake_up_process(tsk); } return 1; @@ -4834,33 +4932,25 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) io_commit_cqring(ctx); } -static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) +static void io_poll_task_func(struct callback_head *cb) { + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *nxt; if (io_poll_rewait(req, &req->poll)) { spin_unlock_irq(&ctx->completion_lock); - return; - } - - hash_del(&req->hash_node); - io_poll_complete(req, req->result, 0); - req->flags |= REQ_F_COMP_LOCKED; - *nxt = io_put_req_find_next(req); - spin_unlock_irq(&ctx->completion_lock); - - io_cqring_ev_posted(ctx); -} + } else { + hash_del(&req->hash_node); + io_poll_complete(req, req->result, 0); + spin_unlock_irq(&ctx->completion_lock); -static void io_poll_task_func(struct callback_head *cb) -{ - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); - struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *nxt = NULL; + nxt = io_put_req_find_next(req); + io_cqring_ev_posted(ctx); + if (nxt) + __io_req_task_submit(nxt); + } - io_poll_task_handler(req, &nxt); - if (nxt) - __io_req_task_submit(nxt); percpu_ref_put(&ctx->refs); } @@ -4917,6 +5007,8 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, * for write). Setup a separate io_poll_iocb if this happens. */ if (unlikely(poll->head)) { + struct io_poll_iocb *poll_one = poll; + /* already have a 2nd entry, fail a third attempt */ if (*poll_ptr) { pt->error = -EINVAL; @@ -4927,7 +5019,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = -ENOMEM; return; } - io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake); + io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake); refcount_inc(&req->refs); poll->wait.private = req; *poll_ptr = poll; @@ -5012,6 +5104,7 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, struct io_ring_ctx *ctx = req->ctx; bool cancel = false; + INIT_HLIST_NODE(&req->hash_node); io_init_poll_iocb(poll, mask, wake_func); poll->file = req->file; poll->wait.private = req; @@ -5073,7 +5166,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) req->flags |= REQ_F_POLLED; req->apoll = apoll; - INIT_HLIST_NODE(&req->hash_node); mask = 0; if (def->pollin) @@ -5144,9 +5236,8 @@ static bool io_poll_remove_one(struct io_kiocb *req) if (do_complete) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(req->ctx); - req->flags |= REQ_F_COMP_LOCKED; req_set_fail_links(req); - io_put_req(req); + io_put_req_deferred(req, 1); } return do_complete; @@ -5256,8 +5347,6 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EINVAL; if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) return -EINVAL; - if (!poll->file) - return -EBADF; events = READ_ONCE(sqe->poll32_events); #ifdef __BIG_ENDIAN @@ -5275,7 +5364,6 @@ static int io_poll_add(struct io_kiocb *req) struct io_poll_table ipt; __poll_t mask; - INIT_HLIST_NODE(&req->hash_node); ipt.pt._qproc = io_poll_queue_proc; mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, @@ -5328,9 +5416,8 @@ static int __io_timeout_cancel(struct io_kiocb *req) list_del_init(&req->timeout.list); req_set_fail_links(req); - req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, -ECANCELED); - io_put_req(req); + io_put_req_deferred(req, 1); return 0; } @@ -5740,9 +5827,9 @@ static void io_req_drop_files(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); req->flags &= ~REQ_F_INFLIGHT; - put_files_struct(req->work.files); - put_nsproxy(req->work.nsproxy); - req->work.files = NULL; + put_files_struct(req->work.identity->files); + put_nsproxy(req->work.identity->nsproxy); + req->work.flags &= ~IO_WQ_WORK_FILES; } static void __io_clean_op(struct io_kiocb *req) @@ -6026,10 +6113,9 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (!list_empty(&req->link_list)) { prev = list_entry(req->link_list.prev, struct io_kiocb, link_list); - if (refcount_inc_not_zero(&prev->refs)) { + if (refcount_inc_not_zero(&prev->refs)) list_del_init(&req->link_list); - prev->flags &= ~REQ_F_LINK_TIMEOUT; - } else + else prev = NULL; } @@ -6086,6 +6172,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT) return NULL; + nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; req->flags |= REQ_F_LINK_TIMEOUT; return nxt; } @@ -6100,14 +6187,15 @@ static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs) again: linked_timeout = io_prep_linked_timeout(req); - if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds && - req->work.creds != current_cred()) { + if ((req->flags & REQ_F_WORK_INITIALIZED) && + (req->work.flags & IO_WQ_WORK_CREDS) && + req->work.identity->creds != current_cred()) { if (old_creds) revert_creds(old_creds); - if (old_creds == req->work.creds) + if (old_creds == req->work.identity->creds) old_creds = NULL; /* restored original creds */ else - old_creds = override_creds(req->work.creds); + old_creds = override_creds(req->work.identity->creds); } ret = io_issue_sqe(req, true, cs); @@ -6148,8 +6236,10 @@ punt: if (nxt) { req = nxt; - if (req->flags & REQ_F_FORCE_ASYNC) + if (req->flags & REQ_F_FORCE_ASYNC) { + linked_timeout = NULL; goto punt; + } goto again; } exit: @@ -6410,11 +6500,17 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, id = READ_ONCE(sqe->personality); if (id) { - io_req_init_async(req); - req->work.creds = idr_find(&ctx->personality_idr, id); - if (unlikely(!req->work.creds)) + struct io_identity *iod; + + iod = idr_find(&ctx->personality_idr, id); + if (unlikely(!iod)) return -EINVAL; - get_cred(req->work.creds); + refcount_inc(&iod->count); + + __io_req_init_async(req); + get_cred(iod->creds); + req->work.identity = iod; + req->work.flags |= IO_WQ_WORK_CREDS; } /* same numerical values with corresponding REQ_F_*, safe to copy */ @@ -6447,7 +6543,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; - atomic_long_add(nr, ¤t->io_uring->req_issue); + percpu_counter_add(¤t->io_uring->inflight, nr); refcount_add(nr, ¤t->usage); io_submit_state_start(&state, ctx, nr); @@ -6489,10 +6585,12 @@ fail_req: if (unlikely(submitted != nr)) { int ref_used = (submitted == -EAGAIN) ? 0 : submitted; + struct io_uring_task *tctx = current->io_uring; + int unused = nr - ref_used; - percpu_ref_put_many(&ctx->refs, nr - ref_used); - atomic_long_sub(nr - ref_used, ¤t->io_uring->req_issue); - put_task_struct_many(current, nr - ref_used); + percpu_ref_put_many(&ctx->refs, unused); + percpu_counter_sub(&tctx->inflight, unused); + put_task_struct_many(current, unused); } if (link) io_queue_link_head(link, &state.comp); @@ -6672,6 +6770,10 @@ static int io_sq_thread(void *data) old_cred = override_creds(ctx->creds); } io_sq_thread_associate_blkcg(ctx, &cur_css); +#ifdef CONFIG_AUDIT + current->loginuid = ctx->loginuid; + current->sessionid = ctx->sessionid; +#endif ret |= __io_sq_thread(ctx, start_jiffies, cap_entries); @@ -7306,7 +7408,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, spin_lock_init(&file_data->lock); nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); - file_data->table = kcalloc(nr_tables, sizeof(file_data->table), + file_data->table = kcalloc(nr_tables, sizeof(*file_data->table), GFP_KERNEL); if (!file_data->table) goto out_free; @@ -7317,6 +7419,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args)) goto out_ref; + ctx->file_data = file_data; for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { struct fixed_file_table *table; @@ -7351,7 +7454,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, table->files[index] = file; } - ctx->file_data = file_data; ret = io_sqe_files_scm(ctx); if (ret) { io_sqe_files_unregister(ctx); @@ -7384,6 +7486,7 @@ out_ref: out_free: kfree(file_data->table); kfree(file_data); + ctx->file_data = NULL; return ret; } @@ -7609,17 +7712,24 @@ out_fput: static int io_uring_alloc_task_context(struct task_struct *task) { struct io_uring_task *tctx; + int ret; tctx = kmalloc(sizeof(*tctx), GFP_KERNEL); if (unlikely(!tctx)) return -ENOMEM; + ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); + if (unlikely(ret)) { + kfree(tctx); + return ret; + } + xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); tctx->last = NULL; tctx->in_idle = 0; - atomic_long_set(&tctx->req_issue, 0); - atomic_long_set(&tctx->req_complete, 0); + io_init_identity(&tctx->__identity); + tctx->identity = &tctx->__identity; task->io_uring = tctx; return 0; } @@ -7629,6 +7739,10 @@ void __io_uring_free(struct task_struct *tsk) struct io_uring_task *tctx = tsk->io_uring; WARN_ON_ONCE(!xa_empty(&tctx->xa)); + WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1); + if (tctx->identity != &tctx->__identity) + kfree(tctx->identity); + percpu_counter_destroy(&tctx->inflight); kfree(tctx); tsk->io_uring = NULL; } @@ -8205,11 +8319,14 @@ static int io_uring_fasync(int fd, struct file *file, int on) static int io_remove_personalities(int id, void *p, void *data) { struct io_ring_ctx *ctx = data; - const struct cred *cred; + struct io_identity *iod; - cred = idr_remove(&ctx->personality_idr, id); - if (cred) - put_cred(cred); + iod = idr_remove(&ctx->personality_idr, id); + if (iod) { + put_cred(iod->creds); + if (refcount_dec_and_test(&iod->count)) + kfree(iod); + } return 0; } @@ -8281,7 +8398,8 @@ static bool io_wq_files_match(struct io_wq_work *work, void *data) { struct files_struct *files = data; - return !files || work->files == files; + return !files || ((work->flags & IO_WQ_WORK_FILES) && + work->identity->files == files); } /* @@ -8436,7 +8554,8 @@ static bool io_uring_cancel_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { - if (files && req->work.files != files) + if (files && (req->work.flags & IO_WQ_WORK_FILES) && + req->work.identity->files != files) continue; /* req is being completed, ignore */ if (!refcount_inc_not_zero(&req->refs)) @@ -8564,19 +8683,11 @@ static void io_uring_del_task_file(struct file *file) fput(file); } -static void __io_uring_attempt_task_drop(struct file *file) -{ - struct file *old = xa_load(¤t->io_uring->xa, (unsigned long)file); - - if (old == file) - io_uring_del_task_file(file); -} - /* * Drop task note for this file if we're the only ones that hold it after * pending fput() */ -static void io_uring_attempt_task_drop(struct file *file, bool exiting) +static void io_uring_attempt_task_drop(struct file *file) { if (!current->io_uring) return; @@ -8584,10 +8695,9 @@ static void io_uring_attempt_task_drop(struct file *file, bool exiting) * fput() is pending, will be 2 if the only other ref is our potential * task file note. If the task is exiting, drop regardless of count. */ - if (!exiting && atomic_long_read(&file->f_count) != 2) - return; - - __io_uring_attempt_task_drop(file); + if (fatal_signal_pending(current) || (current->flags & PF_EXITING) || + atomic_long_read(&file->f_count) == 2) + io_uring_del_task_file(file); } void __io_uring_files_cancel(struct files_struct *files) @@ -8608,12 +8718,6 @@ void __io_uring_files_cancel(struct files_struct *files) } } -static inline bool io_uring_task_idle(struct io_uring_task *tctx) -{ - return atomic_long_read(&tctx->req_issue) == - atomic_long_read(&tctx->req_complete); -} - /* * Find any io_uring fd that this task has registered or done IO on, and cancel * requests. @@ -8622,14 +8726,16 @@ void __io_uring_task_cancel(void) { struct io_uring_task *tctx = current->io_uring; DEFINE_WAIT(wait); - long completions; + s64 inflight; /* make sure overflow events are dropped */ tctx->in_idle = true; - while (!io_uring_task_idle(tctx)) { + do { /* read completions before cancelations */ - completions = atomic_long_read(&tctx->req_complete); + inflight = percpu_counter_sum(&tctx->inflight); + if (!inflight) + break; __io_uring_files_cancel(NULL); prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); @@ -8638,12 +8744,10 @@ void __io_uring_task_cancel(void) * If we've seen completions, retry. This avoids a race where * a completion comes in before we did prepare_to_wait(). */ - if (completions != atomic_long_read(&tctx->req_complete)) + if (inflight != percpu_counter_sum(&tctx->inflight)) continue; - if (io_uring_task_idle(tctx)) - break; schedule(); - } + } while (1); finish_wait(&tctx->wait, &wait); tctx->in_idle = false; @@ -8651,16 +8755,7 @@ void __io_uring_task_cancel(void) static int io_uring_flush(struct file *file, void *data) { - struct io_ring_ctx *ctx = file->private_data; - - /* - * If the task is going away, cancel work it may have pending - */ - if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) - data = NULL; - - io_uring_cancel_task_requests(ctx, data); - io_uring_attempt_task_drop(file, !data); + io_uring_attempt_task_drop(file); return 0; } @@ -9109,7 +9204,10 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->compat = in_compat_syscall(); ctx->user = user; ctx->creds = get_current_cred(); - +#ifdef CONFIG_AUDIT + ctx->loginuid = current->loginuid; + ctx->sessionid = current->sessionid; +#endif ctx->sqo_task = get_task_struct(current); /* @@ -9277,23 +9375,33 @@ out: static int io_register_personality(struct io_ring_ctx *ctx) { - const struct cred *creds = get_current_cred(); - int id; + struct io_identity *id; + int ret; + + id = kmalloc(sizeof(*id), GFP_KERNEL); + if (unlikely(!id)) + return -ENOMEM; - id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1, - USHRT_MAX, GFP_KERNEL); - if (id < 0) - put_cred(creds); - return id; + io_init_identity(id); + id->creds = get_current_cred(); + + ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL); + if (ret < 0) { + put_cred(id->creds); + kfree(id); + } + return ret; } static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) { - const struct cred *old_creds; + struct io_identity *iod; - old_creds = idr_remove(&ctx->personality_idr, id); - if (old_creds) { - put_cred(old_creds); + iod = idr_remove(&ctx->personality_idr, id); + if (iod) { + put_cred(iod->creds); + if (refcount_dec_and_test(&iod->count)) + kfree(iod); return 0; } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 78f5c96c76f3..ec90773527ee 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1038,8 +1038,7 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = 0; buf->f_files = ISOFS_SB(sb)->s_ninodes; buf->f_ffree = 0; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = NAME_MAX; return 0; } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6d2da8ad0e6f..fa688e163a80 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -187,20 +187,48 @@ static int journal_wait_on_commit_record(journal_t *journal, * use writepages() because with delayed allocation we may be doing * block allocation in writepages(). */ -static int journal_submit_inode_data_buffers(struct address_space *mapping, - loff_t dirty_start, loff_t dirty_end) +int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { - int ret; + struct address_space *mapping = jinode->i_vfs_inode->i_mapping; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = mapping->nrpages * 2, - .range_start = dirty_start, - .range_end = dirty_end, + .range_start = jinode->i_dirty_start, + .range_end = jinode->i_dirty_end, }; - ret = generic_writepages(mapping, &wbc); - return ret; + /* + * submit the inode data buffers. We use writepage + * instead of writepages. Because writepages can do + * block allocation with delalloc. We need to write + * only allocated blocks here. + */ + return generic_writepages(mapping, &wbc); +} + +/* Send all the data buffers related to an inode */ +int jbd2_submit_inode_data(struct jbd2_inode *jinode) +{ + + if (!jinode || !(jinode->i_flags & JI_WRITE_DATA)) + return 0; + + trace_jbd2_submit_inode_data(jinode->i_vfs_inode); + return jbd2_journal_submit_inode_data_buffers(jinode); + +} +EXPORT_SYMBOL(jbd2_submit_inode_data); + +int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode) +{ + if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) || + !jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping) + return 0; + return filemap_fdatawait_range_keep_errors( + jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start, + jinode->i_dirty_end); } +EXPORT_SYMBOL(jbd2_wait_inode_data); /* * Submit all the data buffers of inode associated with the transaction to @@ -215,29 +243,20 @@ static int journal_submit_data_buffers(journal_t *journal, { struct jbd2_inode *jinode; int err, ret = 0; - struct address_space *mapping; spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { - loff_t dirty_start = jinode->i_dirty_start; - loff_t dirty_end = jinode->i_dirty_end; - if (!(jinode->i_flags & JI_WRITE_DATA)) continue; - mapping = jinode->i_vfs_inode->i_mapping; jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); - /* - * submit the inode data buffers. We use writepage - * instead of writepages. Because writepages can do - * block allocation with delalloc. We need to write - * only allocated blocks here. - */ + /* submit the inode data buffers. */ trace_jbd2_submit_inode_data(jinode->i_vfs_inode); - err = journal_submit_inode_data_buffers(mapping, dirty_start, - dirty_end); - if (!ret) - ret = err; + if (journal->j_submit_inode_data_buffers) { + err = journal->j_submit_inode_data_buffers(jinode); + if (!ret) + ret = err; + } spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); jinode->i_flags &= ~JI_COMMIT_RUNNING; @@ -248,6 +267,15 @@ static int journal_submit_data_buffers(journal_t *journal, return ret; } +int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) +{ + struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + + return filemap_fdatawait_range_keep_errors(mapping, + jinode->i_dirty_start, + jinode->i_dirty_end); +} + /* * Wait for data submitted for writeout, refile inodes to proper * transaction if needed. @@ -262,18 +290,16 @@ static int journal_finish_inode_data_buffers(journal_t *journal, /* For locking, see the comment in journal_submit_data_buffers() */ spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { - loff_t dirty_start = jinode->i_dirty_start; - loff_t dirty_end = jinode->i_dirty_end; - if (!(jinode->i_flags & JI_WAIT_DATA)) continue; jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); - err = filemap_fdatawait_range_keep_errors( - jinode->i_vfs_inode->i_mapping, dirty_start, - dirty_end); - if (!ret) - ret = err; + /* wait for the inode data buffers writeout. */ + if (journal->j_finish_inode_data_buffers) { + err = journal->j_finish_inode_data_buffers(jinode); + if (!ret) + ret = err; + } spin_lock(&journal->j_list_lock); jinode->i_flags &= ~JI_COMMIT_RUNNING; smp_mb(); @@ -413,6 +439,20 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT(journal->j_running_transaction != NULL); J_ASSERT(journal->j_committing_transaction == NULL); + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_FULL_COMMIT_ONGOING; + while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_fc_wait, &wait, + TASK_UNINTERRUPTIBLE); + write_unlock(&journal->j_state_lock); + schedule(); + write_lock(&journal->j_state_lock); + finish_wait(&journal->j_fc_wait, &wait); + } + write_unlock(&journal->j_state_lock); + commit_transaction = journal->j_running_transaction; trace_jbd2_start_commit(journal, commit_transaction); @@ -420,6 +460,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction->t_tid); write_lock(&journal->j_state_lock); + journal->j_fc_off = 0; J_ASSERT(commit_transaction->t_state == T_RUNNING); commit_transaction->t_state = T_LOCKED; @@ -1119,12 +1160,16 @@ restart_loop: if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); + if (journal->j_fc_cleanup_callback) + journal->j_fc_cleanup_callback(journal, 1); trace_jbd2_end_commit(journal, commit_transaction); jbd_debug(1, "JBD2: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); write_lock(&journal->j_state_lock); + journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING; + journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; spin_lock(&journal->j_list_lock); commit_transaction->t_state = T_FINISHED; /* Check if the transaction can be dropped now that we are finished */ @@ -1136,6 +1181,7 @@ restart_loop: spin_unlock(&journal->j_list_lock); write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_done_commit); + wake_up(&journal->j_fc_wait); /* * Calculate overall stats diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 17fdc482f554..0c7c42bd530f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -91,6 +91,8 @@ EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait); +EXPORT_SYMBOL(jbd2_journal_submit_inode_data_buffers); +EXPORT_SYMBOL(jbd2_journal_finish_inode_data_buffers); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); @@ -157,7 +159,9 @@ static void commit_timeout(struct timer_list *t) * * 1) COMMIT: Every so often we need to commit the current state of the * filesystem to disk. The journal thread is responsible for writing - * all of the metadata buffers to disk. + * all of the metadata buffers to disk. If a fast commit is ongoing + * journal thread waits until it's done and then continues from + * there on. * * 2) CHECKPOINT: We cannot reuse a used section of the log file until all * of the data in that part of the log has been rewritten elsewhere on @@ -714,6 +718,75 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) return err; } +/* + * Start a fast commit. If there's an ongoing fast or full commit wait for + * it to complete. Returns 0 if a new fast commit was started. Returns -EALREADY + * if a fast commit is not needed, either because there's an already a commit + * going on or this tid has already been committed. Returns -EINVAL if no jbd2 + * commit has yet been performed. + */ +int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) +{ + /* + * Fast commits only allowed if at least one full commit has + * been processed. + */ + if (!journal->j_stats.ts_tid) + return -EINVAL; + + if (tid <= journal->j_commit_sequence) + return -EALREADY; + + write_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_FULL_COMMIT_ONGOING || + (journal->j_flags & JBD2_FAST_COMMIT_ONGOING)) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_fc_wait, &wait, + TASK_UNINTERRUPTIBLE); + write_unlock(&journal->j_state_lock); + schedule(); + finish_wait(&journal->j_fc_wait, &wait); + return -EALREADY; + } + journal->j_flags |= JBD2_FAST_COMMIT_ONGOING; + write_unlock(&journal->j_state_lock); + + return 0; +} +EXPORT_SYMBOL(jbd2_fc_begin_commit); + +/* + * Stop a fast commit. If fallback is set, this function starts commit of + * TID tid before any other fast commit can start. + */ +static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) +{ + if (journal->j_fc_cleanup_callback) + journal->j_fc_cleanup_callback(journal, 0); + write_lock(&journal->j_state_lock); + journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; + if (fallback) + journal->j_flags |= JBD2_FULL_COMMIT_ONGOING; + write_unlock(&journal->j_state_lock); + wake_up(&journal->j_fc_wait); + if (fallback) + return jbd2_complete_transaction(journal, tid); + return 0; +} + +int jbd2_fc_end_commit(journal_t *journal) +{ + return __jbd2_fc_end_commit(journal, 0, 0); +} +EXPORT_SYMBOL(jbd2_fc_end_commit); + +int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid) +{ + return __jbd2_fc_end_commit(journal, tid, 1); +} +EXPORT_SYMBOL(jbd2_fc_end_commit_fallback); + /* Return 1 when transaction with given tid has already committed. */ int jbd2_transaction_committed(journal_t *journal, tid_t tid) { @@ -782,6 +855,110 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) return jbd2_journal_bmap(journal, blocknr, retp); } +/* Map one fast commit buffer for use by the file system */ +int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) +{ + unsigned long long pblock; + unsigned long blocknr; + int ret = 0; + struct buffer_head *bh; + int fc_off; + + *bh_out = NULL; + write_lock(&journal->j_state_lock); + + if (journal->j_fc_off + journal->j_fc_first < journal->j_fc_last) { + fc_off = journal->j_fc_off; + blocknr = journal->j_fc_first + fc_off; + journal->j_fc_off++; + } else { + ret = -EINVAL; + } + write_unlock(&journal->j_state_lock); + + if (ret) + return ret; + + ret = jbd2_journal_bmap(journal, blocknr, &pblock); + if (ret) + return ret; + + bh = __getblk(journal->j_dev, pblock, journal->j_blocksize); + if (!bh) + return -ENOMEM; + + lock_buffer(bh); + + clear_buffer_uptodate(bh); + set_buffer_dirty(bh); + unlock_buffer(bh); + journal->j_fc_wbuf[fc_off] = bh; + + *bh_out = bh; + + return 0; +} +EXPORT_SYMBOL(jbd2_fc_get_buf); + +/* + * Wait on fast commit buffers that were allocated by jbd2_fc_get_buf + * for completion. + */ +int jbd2_fc_wait_bufs(journal_t *journal, int num_blks) +{ + struct buffer_head *bh; + int i, j_fc_off; + + read_lock(&journal->j_state_lock); + j_fc_off = journal->j_fc_off; + read_unlock(&journal->j_state_lock); + + /* + * Wait in reverse order to minimize chances of us being woken up before + * all IOs have completed + */ + for (i = j_fc_off - 1; i >= j_fc_off - num_blks; i--) { + bh = journal->j_fc_wbuf[i]; + wait_on_buffer(bh); + put_bh(bh); + journal->j_fc_wbuf[i] = NULL; + if (unlikely(!buffer_uptodate(bh))) + return -EIO; + } + + return 0; +} +EXPORT_SYMBOL(jbd2_fc_wait_bufs); + +/* + * Wait on fast commit buffers that were allocated by jbd2_fc_get_buf + * for completion. + */ +int jbd2_fc_release_bufs(journal_t *journal) +{ + struct buffer_head *bh; + int i, j_fc_off; + + read_lock(&journal->j_state_lock); + j_fc_off = journal->j_fc_off; + read_unlock(&journal->j_state_lock); + + /* + * Wait in reverse order to minimize chances of us being woken up before + * all IOs have completed + */ + for (i = j_fc_off - 1; i >= 0; i--) { + bh = journal->j_fc_wbuf[i]; + if (!bh) + break; + put_bh(bh); + journal->j_fc_wbuf[i] = NULL; + } + + return 0; +} +EXPORT_SYMBOL(jbd2_fc_release_bufs); + /* * Conversion of logical to physical block numbers for the journal * @@ -1140,6 +1317,7 @@ static journal_t *journal_init_common(struct block_device *bdev, init_waitqueue_head(&journal->j_wait_commit); init_waitqueue_head(&journal->j_wait_updates); init_waitqueue_head(&journal->j_wait_reserved); + init_waitqueue_head(&journal->j_fc_wait); mutex_init(&journal->j_abort_mutex); mutex_init(&journal->j_barrier); mutex_init(&journal->j_checkpoint_mutex); @@ -1179,6 +1357,14 @@ static journal_t *journal_init_common(struct block_device *bdev, if (!journal->j_wbuf) goto err_cleanup; + if (journal->j_fc_wbufsize > 0) { + journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize, + sizeof(struct buffer_head *), + GFP_KERNEL); + if (!journal->j_fc_wbuf) + goto err_cleanup; + } + bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize); if (!bh) { pr_err("%s: Cannot get buffer for journal superblock\n", @@ -1192,11 +1378,23 @@ static journal_t *journal_init_common(struct block_device *bdev, err_cleanup: kfree(journal->j_wbuf); + kfree(journal->j_fc_wbuf); jbd2_journal_destroy_revoke(journal); kfree(journal); return NULL; } +int jbd2_fc_init(journal_t *journal, int num_fc_blks) +{ + journal->j_fc_wbufsize = num_fc_blks; + journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize, + sizeof(struct buffer_head *), GFP_KERNEL); + if (!journal->j_fc_wbuf) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL(jbd2_fc_init); + /* jbd2_journal_init_dev and jbd2_journal_init_inode: * * Create a journal structure assigned some fixed set of disk blocks to @@ -1314,11 +1512,20 @@ static int journal_reset(journal_t *journal) } journal->j_first = first; - journal->j_last = last; - journal->j_head = first; - journal->j_tail = first; - journal->j_free = last - first; + if (jbd2_has_feature_fast_commit(journal) && + journal->j_fc_wbufsize > 0) { + journal->j_fc_last = last; + journal->j_last = last - journal->j_fc_wbufsize; + journal->j_fc_first = journal->j_last + 1; + journal->j_fc_off = 0; + } else { + journal->j_last = last; + } + + journal->j_head = journal->j_first; + journal->j_tail = journal->j_first; + journal->j_free = journal->j_last - journal->j_first; journal->j_tail_sequence = journal->j_transaction_sequence; journal->j_commit_sequence = journal->j_transaction_sequence - 1; @@ -1464,6 +1671,7 @@ out: static void jbd2_mark_journal_empty(journal_t *journal, int write_op) { journal_superblock_t *sb = journal->j_superblock; + bool had_fast_commit = false; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); lock_buffer(journal->j_sb_buffer); @@ -1477,9 +1685,20 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op) sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); sb->s_start = cpu_to_be32(0); + if (jbd2_has_feature_fast_commit(journal)) { + /* + * When journal is clean, no need to commit fast commit flag and + * make file system incompatible with older kernels. + */ + jbd2_clear_feature_fast_commit(journal); + had_fast_commit = true; + } jbd2_write_superblock(journal, write_op); + if (had_fast_commit) + jbd2_set_feature_fast_commit(journal); + /* Log is no longer empty */ write_lock(&journal->j_state_lock); journal->j_flags |= JBD2_FLUSHED; @@ -1663,9 +1882,18 @@ static int load_superblock(journal_t *journal) journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); journal->j_tail = be32_to_cpu(sb->s_start); journal->j_first = be32_to_cpu(sb->s_first); - journal->j_last = be32_to_cpu(sb->s_maxlen); journal->j_errno = be32_to_cpu(sb->s_errno); + if (jbd2_has_feature_fast_commit(journal) && + journal->j_fc_wbufsize > 0) { + journal->j_fc_last = be32_to_cpu(sb->s_maxlen); + journal->j_last = journal->j_fc_last - journal->j_fc_wbufsize; + journal->j_fc_first = journal->j_last + 1; + journal->j_fc_off = 0; + } else { + journal->j_last = be32_to_cpu(sb->s_maxlen); + } + return 0; } @@ -1726,6 +1954,9 @@ int jbd2_journal_load(journal_t *journal) */ journal->j_flags &= ~JBD2_ABORT; + if (journal->j_fc_wbufsize > 0) + jbd2_journal_set_features(journal, 0, 0, + JBD2_FEATURE_INCOMPAT_FAST_COMMIT); /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory * and reset them on disk. */ @@ -1809,6 +2040,8 @@ int jbd2_journal_destroy(journal_t *journal) jbd2_journal_destroy_revoke(journal); if (journal->j_chksum_driver) crypto_free_shash(journal->j_chksum_driver); + if (journal->j_fc_wbufsize > 0) + kfree(journal->j_fc_wbuf); kfree(journal->j_wbuf); kfree(journal); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index faa97d748474..eb2606133cd8 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -35,7 +35,6 @@ struct recovery_info int nr_revoke_hits; }; -enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass); static int scan_revoke_records(journal_t *, struct buffer_head *, @@ -225,10 +224,51 @@ static int count_tags(journal_t *journal, struct buffer_head *bh) /* Make sure we wrap around the log correctly! */ #define wrap(journal, var) \ do { \ - if (var >= (journal)->j_last) \ - var -= ((journal)->j_last - (journal)->j_first); \ + unsigned long _wrap_last = \ + jbd2_has_feature_fast_commit(journal) ? \ + (journal)->j_fc_last : (journal)->j_last; \ + \ + if (var >= _wrap_last) \ + var -= (_wrap_last - (journal)->j_first); \ } while (0) +static int fc_do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass) +{ + unsigned int expected_commit_id = info->end_transaction; + unsigned long next_fc_block; + struct buffer_head *bh; + int err = 0; + + next_fc_block = journal->j_fc_first; + if (!journal->j_fc_replay_callback) + return 0; + + while (next_fc_block <= journal->j_fc_last) { + jbd_debug(3, "Fast commit replay: next block %ld", + next_fc_block); + err = jread(&bh, journal, next_fc_block); + if (err) { + jbd_debug(3, "Fast commit replay: read error"); + break; + } + + jbd_debug(3, "Processing fast commit blk with seq %d"); + err = journal->j_fc_replay_callback(journal, bh, pass, + next_fc_block - journal->j_fc_first, + expected_commit_id); + next_fc_block++; + if (err < 0 || err == JBD2_FC_REPLAY_STOP) + break; + err = 0; + } + + if (err) + jbd_debug(3, "Fast commit replay failed, err = %d\n", err); + + return err; +} + /** * jbd2_journal_recover - recovers a on-disk journal * @journal: the journal to recover @@ -428,6 +468,8 @@ static int do_one_pass(journal_t *journal, __u32 crc32_sum = ~0; /* Transactional Checksums */ int descr_csum_size = 0; int block_error = 0; + bool need_check_commit_time = false; + __u64 last_trans_commit_time = 0, commit_time; /* * First thing is to establish what we expect to find in the log @@ -470,7 +512,9 @@ static int do_one_pass(journal_t *journal, break; jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", - next_commit_ID, next_log_block, journal->j_last); + next_commit_ID, next_log_block, + jbd2_has_feature_fast_commit(journal) ? + journal->j_fc_last : journal->j_last); /* Skip over each chunk of the transaction looking * either the next descriptor block or the final commit @@ -520,12 +564,21 @@ static int do_one_pass(journal_t *journal, if (descr_csum_size > 0 && !jbd2_descriptor_block_csum_verify(journal, bh->b_data)) { - printk(KERN_ERR "JBD2: Invalid checksum " - "recovering block %lu in log\n", - next_log_block); - err = -EFSBADCRC; - brelse(bh); - goto failed; + /* + * PASS_SCAN can see stale blocks due to lazy + * journal init. Don't error out on those yet. + */ + if (pass != PASS_SCAN) { + pr_err("JBD2: Invalid checksum recovering block %lu in log\n", + next_log_block); + err = -EFSBADCRC; + brelse(bh); + goto failed; + } + need_check_commit_time = true; + jbd_debug(1, + "invalid descriptor block found in %lu\n", + next_log_block); } /* If it is a valid descriptor block, replay it @@ -535,6 +588,7 @@ static int do_one_pass(journal_t *journal, if (pass != PASS_REPLAY) { if (pass == PASS_SCAN && jbd2_has_feature_checksum(journal) && + !need_check_commit_time && !info->end_transaction) { if (calc_chksums(journal, bh, &next_log_block, @@ -683,11 +737,41 @@ static int do_one_pass(journal_t *journal, * mentioned conditions. Hence assume * "Interrupted Commit".) */ + commit_time = be64_to_cpu( + ((struct commit_header *)bh->b_data)->h_commit_sec); + /* + * If need_check_commit_time is set, it means we are in + * PASS_SCAN and csum verify failed before. If + * commit_time is increasing, it's the same journal, + * otherwise it is stale journal block, just end this + * recovery. + */ + if (need_check_commit_time) { + if (commit_time >= last_trans_commit_time) { + pr_err("JBD2: Invalid checksum found in transaction %u\n", + next_commit_ID); + err = -EFSBADCRC; + brelse(bh); + goto failed; + } + ignore_crc_mismatch: + /* + * It likely does not belong to same journal, + * just end this recovery with success. + */ + jbd_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n", + next_commit_ID); + err = 0; + brelse(bh); + goto done; + } - /* Found an expected commit block: if checksums - * are present verify them in PASS_SCAN; else not + /* + * Found an expected commit block: if checksums + * are present, verify them in PASS_SCAN; else not * much to do other than move on to the next sequence - * number. */ + * number. + */ if (pass == PASS_SCAN && jbd2_has_feature_checksum(journal)) { struct commit_header *cbh = @@ -719,6 +803,8 @@ static int do_one_pass(journal_t *journal, !jbd2_commit_block_csum_verify(journal, bh->b_data)) { chksum_error: + if (commit_time < last_trans_commit_time) + goto ignore_crc_mismatch; info->end_transaction = next_commit_ID; if (!jbd2_has_feature_async_commit(journal)) { @@ -728,11 +814,24 @@ static int do_one_pass(journal_t *journal, break; } } + if (pass == PASS_SCAN) + last_trans_commit_time = commit_time; brelse(bh); next_commit_ID++; continue; case JBD2_REVOKE_BLOCK: + /* + * Check revoke block crc in pass_scan, if csum verify + * failed, check commit block time later. + */ + if (pass == PASS_SCAN && + !jbd2_descriptor_block_csum_verify(journal, + bh->b_data)) { + jbd_debug(1, "JBD2: invalid revoke block found in %lu\n", + next_log_block); + need_check_commit_time = true; + } /* If we aren't in the REVOKE pass, then we can * just skip over this block. */ if (pass != PASS_REVOKE) { @@ -777,6 +876,13 @@ static int do_one_pass(journal_t *journal, success = -EIO; } } + + if (jbd2_has_feature_fast_commit(journal) && pass != PASS_REVOKE) { + err = fc_do_one_pass(journal, info, pass); + if (err) + success = err; + } + if (block_error && success == 0) success = -EIO; return success; @@ -800,9 +906,6 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, offset = sizeof(jbd2_journal_revoke_header_t); rcount = be32_to_cpu(header->r_count); - if (!jbd2_descriptor_block_csum_verify(journal, header)) - return -EFSBADCRC; - if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_block_tail); if (rcount > journal->j_blocksize - csum_size) diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 1eabd91870e6..1d9488cf0534 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -417,7 +417,7 @@ void nsm_release(struct nsm_handle *nsm) /* * XDR functions for NSM. * - * See http://www.opengroup.org/ for details on the Network + * See https://www.opengroup.org/ for details on the Network * Status Monitor wire protocol. */ diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index e4d3f783e06a..fa41dda39925 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -486,65 +486,215 @@ nlm4svc_proc_granted_res(struct svc_rqst *rqstp) return rpc_success; } +static __be32 +nlm4svc_proc_unused(struct svc_rqst *rqstp) +{ + return rpc_proc_unavail; +} + /* * NLM Server procedures. */ -#define nlm4svc_encode_norep nlm4svc_encode_void -#define nlm4svc_decode_norep nlm4svc_decode_void -#define nlm4svc_decode_testres nlm4svc_decode_void -#define nlm4svc_decode_lockres nlm4svc_decode_void -#define nlm4svc_decode_unlockres nlm4svc_decode_void -#define nlm4svc_decode_cancelres nlm4svc_decode_void -#define nlm4svc_decode_grantedres nlm4svc_decode_void - -#define nlm4svc_proc_none nlm4svc_proc_null -#define nlm4svc_proc_test_res nlm4svc_proc_null -#define nlm4svc_proc_lock_res nlm4svc_proc_null -#define nlm4svc_proc_cancel_res nlm4svc_proc_null -#define nlm4svc_proc_unlock_res nlm4svc_proc_null - struct nlm_void { int dummy; }; -#define PROC(name, xargt, xrest, argt, rest, respsize) \ - { .pc_func = nlm4svc_proc_##name, \ - .pc_decode = nlm4svc_decode_##xargt, \ - .pc_encode = nlm4svc_encode_##xrest, \ - .pc_release = NULL, \ - .pc_argsize = sizeof(struct nlm_##argt), \ - .pc_ressize = sizeof(struct nlm_##rest), \ - .pc_xdrressize = respsize, \ - } #define Ck (1+XDR_QUADLEN(NLM_MAXCOOKIELEN)) /* cookie */ #define No (1+1024/4) /* netobj */ #define St 1 /* status */ #define Rg 4 /* range (offset + length) */ -const struct svc_procedure nlmsvc_procedures4[] = { - PROC(null, void, void, void, void, 1), - PROC(test, testargs, testres, args, res, Ck+St+2+No+Rg), - PROC(lock, lockargs, res, args, res, Ck+St), - PROC(cancel, cancargs, res, args, res, Ck+St), - PROC(unlock, unlockargs, res, args, res, Ck+St), - PROC(granted, testargs, res, args, res, Ck+St), - PROC(test_msg, testargs, norep, args, void, 1), - PROC(lock_msg, lockargs, norep, args, void, 1), - PROC(cancel_msg, cancargs, norep, args, void, 1), - PROC(unlock_msg, unlockargs, norep, args, void, 1), - PROC(granted_msg, testargs, norep, args, void, 1), - PROC(test_res, testres, norep, res, void, 1), - PROC(lock_res, lockres, norep, res, void, 1), - PROC(cancel_res, cancelres, norep, res, void, 1), - PROC(unlock_res, unlockres, norep, res, void, 1), - PROC(granted_res, res, norep, res, void, 1), - /* statd callback */ - PROC(sm_notify, reboot, void, reboot, void, 1), - PROC(none, void, void, void, void, 0), - PROC(none, void, void, void, void, 0), - PROC(none, void, void, void, void, 0), - PROC(share, shareargs, shareres, args, res, Ck+St+1), - PROC(unshare, shareargs, shareres, args, res, Ck+St+1), - PROC(nm_lock, lockargs, res, args, res, Ck+St), - PROC(free_all, notify, void, args, void, 1), +const struct svc_procedure nlmsvc_procedures4[24] = { + [NLMPROC_NULL] = { + .pc_func = nlm4svc_proc_null, + .pc_decode = nlm4svc_decode_void, + .pc_encode = nlm4svc_encode_void, + .pc_argsize = sizeof(struct nlm_void), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_TEST] = { + .pc_func = nlm4svc_proc_test, + .pc_decode = nlm4svc_decode_testargs, + .pc_encode = nlm4svc_encode_testres, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_res), + .pc_xdrressize = Ck+St+2+No+Rg, + }, + [NLMPROC_LOCK] = { + .pc_func = nlm4svc_proc_lock, + .pc_decode = nlm4svc_decode_lockargs, + .pc_encode = nlm4svc_encode_res, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_res), + .pc_xdrressize = Ck+St, + }, + [NLMPROC_CANCEL] = { + .pc_func = nlm4svc_proc_cancel, + .pc_decode = nlm4svc_decode_cancargs, + .pc_encode = nlm4svc_encode_res, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_res), + .pc_xdrressize = Ck+St, + }, + [NLMPROC_UNLOCK] = { + .pc_func = nlm4svc_proc_unlock, + .pc_decode = nlm4svc_decode_unlockargs, + .pc_encode = nlm4svc_encode_res, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_res), + .pc_xdrressize = Ck+St, + }, + [NLMPROC_GRANTED] = { + .pc_func = nlm4svc_proc_granted, + .pc_decode = nlm4svc_decode_testargs, + .pc_encode = nlm4svc_encode_res, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_res), + .pc_xdrressize = Ck+St, + }, + [NLMPROC_TEST_MSG] = { + .pc_func = nlm4svc_proc_test_msg, + .pc_decode = nlm4svc_decode_testargs, + .pc_encode = nlm4svc_encode_void, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_LOCK_MSG] = { + .pc_func = nlm4svc_proc_lock_msg, + .pc_decode = nlm4svc_decode_lockargs, + .pc_encode = nlm4svc_encode_void, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_CANCEL_MSG] = { + .pc_func = nlm4svc_proc_cancel_msg, + .pc_decode = nlm4svc_decode_cancargs, + .pc_encode = nlm4svc_encode_void, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_UNLOCK_MSG] = { + .pc_func = nlm4svc_proc_unlock_msg, + .pc_decode = nlm4svc_decode_unlockargs, + .pc_encode = nlm4svc_encode_void, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_GRANTED_MSG] = { + .pc_func = nlm4svc_proc_granted_msg, + .pc_decode = nlm4svc_decode_testargs, + .pc_encode = nlm4svc_encode_void, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_TEST_RES] = { + .pc_func = nlm4svc_proc_null, + .pc_decode = nlm4svc_decode_void, + .pc_encode = nlm4svc_encode_void, + .pc_argsize = sizeof(struct nlm_res), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_LOCK_RES] = { + .pc_func = nlm4svc_proc_null, + .pc_decode = nlm4svc_decode_void, + .pc_encode = nlm4svc_encode_void, + .pc_argsize = sizeof(struct nlm_res), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_CANCEL_RES] = { + .pc_func = nlm4svc_proc_null, + .pc_decode = nlm4svc_decode_void, + .pc_encode = nlm4svc_encode_void, + .pc_argsize = sizeof(struct nlm_res), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_UNLOCK_RES] = { + .pc_func = nlm4svc_proc_null, + .pc_decode = nlm4svc_decode_void, + .pc_encode = nlm4svc_encode_void, + .pc_argsize = sizeof(struct nlm_res), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_GRANTED_RES] = { + .pc_func = nlm4svc_proc_granted_res, + .pc_decode = nlm4svc_decode_res, + .pc_encode = nlm4svc_encode_void, + .pc_argsize = sizeof(struct nlm_res), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_NSM_NOTIFY] = { + .pc_func = nlm4svc_proc_sm_notify, + .pc_decode = nlm4svc_decode_reboot, + .pc_encode = nlm4svc_encode_void, + .pc_argsize = sizeof(struct nlm_reboot), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [17] = { + .pc_func = nlm4svc_proc_unused, + .pc_decode = nlm4svc_decode_void, + .pc_encode = nlm4svc_encode_void, + .pc_argsize = sizeof(struct nlm_void), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = 0, + }, + [18] = { + .pc_func = nlm4svc_proc_unused, + .pc_decode = nlm4svc_decode_void, + .pc_encode = nlm4svc_encode_void, + .pc_argsize = sizeof(struct nlm_void), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = 0, + }, + [19] = { + .pc_func = nlm4svc_proc_unused, + .pc_decode = nlm4svc_decode_void, + .pc_encode = nlm4svc_encode_void, + .pc_argsize = sizeof(struct nlm_void), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = 0, + }, + [NLMPROC_SHARE] = { + .pc_func = nlm4svc_proc_share, + .pc_decode = nlm4svc_decode_shareargs, + .pc_encode = nlm4svc_encode_shareres, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_res), + .pc_xdrressize = Ck+St+1, + }, + [NLMPROC_UNSHARE] = { + .pc_func = nlm4svc_proc_unshare, + .pc_decode = nlm4svc_decode_shareargs, + .pc_encode = nlm4svc_encode_shareres, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_res), + .pc_xdrressize = Ck+St+1, + }, + [NLMPROC_NM_LOCK] = { + .pc_func = nlm4svc_proc_nm_lock, + .pc_decode = nlm4svc_decode_lockargs, + .pc_encode = nlm4svc_encode_res, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_res), + .pc_xdrressize = Ck+St, + }, + [NLMPROC_FREE_ALL] = { + .pc_func = nlm4svc_proc_free_all, + .pc_decode = nlm4svc_decode_notify, + .pc_encode = nlm4svc_encode_void, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, }; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index d0bb7a6bf005..50855f2c1f4b 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -529,66 +529,214 @@ nlmsvc_proc_granted_res(struct svc_rqst *rqstp) return rpc_success; } +static __be32 +nlmsvc_proc_unused(struct svc_rqst *rqstp) +{ + return rpc_proc_unavail; +} + /* * NLM Server procedures. */ -#define nlmsvc_encode_norep nlmsvc_encode_void -#define nlmsvc_decode_norep nlmsvc_decode_void -#define nlmsvc_decode_testres nlmsvc_decode_void -#define nlmsvc_decode_lockres nlmsvc_decode_void -#define nlmsvc_decode_unlockres nlmsvc_decode_void -#define nlmsvc_decode_cancelres nlmsvc_decode_void -#define nlmsvc_decode_grantedres nlmsvc_decode_void - -#define nlmsvc_proc_none nlmsvc_proc_null -#define nlmsvc_proc_test_res nlmsvc_proc_null -#define nlmsvc_proc_lock_res nlmsvc_proc_null -#define nlmsvc_proc_cancel_res nlmsvc_proc_null -#define nlmsvc_proc_unlock_res nlmsvc_proc_null - struct nlm_void { int dummy; }; -#define PROC(name, xargt, xrest, argt, rest, respsize) \ - { .pc_func = nlmsvc_proc_##name, \ - .pc_decode = nlmsvc_decode_##xargt, \ - .pc_encode = nlmsvc_encode_##xrest, \ - .pc_release = NULL, \ - .pc_argsize = sizeof(struct nlm_##argt), \ - .pc_ressize = sizeof(struct nlm_##rest), \ - .pc_xdrressize = respsize, \ - } - #define Ck (1+XDR_QUADLEN(NLM_MAXCOOKIELEN)) /* cookie */ #define St 1 /* status */ #define No (1+1024/4) /* Net Obj */ #define Rg 2 /* range - offset + size */ -const struct svc_procedure nlmsvc_procedures[] = { - PROC(null, void, void, void, void, 1), - PROC(test, testargs, testres, args, res, Ck+St+2+No+Rg), - PROC(lock, lockargs, res, args, res, Ck+St), - PROC(cancel, cancargs, res, args, res, Ck+St), - PROC(unlock, unlockargs, res, args, res, Ck+St), - PROC(granted, testargs, res, args, res, Ck+St), - PROC(test_msg, testargs, norep, args, void, 1), - PROC(lock_msg, lockargs, norep, args, void, 1), - PROC(cancel_msg, cancargs, norep, args, void, 1), - PROC(unlock_msg, unlockargs, norep, args, void, 1), - PROC(granted_msg, testargs, norep, args, void, 1), - PROC(test_res, testres, norep, res, void, 1), - PROC(lock_res, lockres, norep, res, void, 1), - PROC(cancel_res, cancelres, norep, res, void, 1), - PROC(unlock_res, unlockres, norep, res, void, 1), - PROC(granted_res, res, norep, res, void, 1), - /* statd callback */ - PROC(sm_notify, reboot, void, reboot, void, 1), - PROC(none, void, void, void, void, 1), - PROC(none, void, void, void, void, 1), - PROC(none, void, void, void, void, 1), - PROC(share, shareargs, shareres, args, res, Ck+St+1), - PROC(unshare, shareargs, shareres, args, res, Ck+St+1), - PROC(nm_lock, lockargs, res, args, res, Ck+St), - PROC(free_all, notify, void, args, void, 0), - +const struct svc_procedure nlmsvc_procedures[24] = { + [NLMPROC_NULL] = { + .pc_func = nlmsvc_proc_null, + .pc_decode = nlmsvc_decode_void, + .pc_encode = nlmsvc_encode_void, + .pc_argsize = sizeof(struct nlm_void), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_TEST] = { + .pc_func = nlmsvc_proc_test, + .pc_decode = nlmsvc_decode_testargs, + .pc_encode = nlmsvc_encode_testres, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_res), + .pc_xdrressize = Ck+St+2+No+Rg, + }, + [NLMPROC_LOCK] = { + .pc_func = nlmsvc_proc_lock, + .pc_decode = nlmsvc_decode_lockargs, + .pc_encode = nlmsvc_encode_res, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_res), + .pc_xdrressize = Ck+St, + }, + [NLMPROC_CANCEL] = { + .pc_func = nlmsvc_proc_cancel, + .pc_decode = nlmsvc_decode_cancargs, + .pc_encode = nlmsvc_encode_res, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_res), + .pc_xdrressize = Ck+St, + }, + [NLMPROC_UNLOCK] = { + .pc_func = nlmsvc_proc_unlock, + .pc_decode = nlmsvc_decode_unlockargs, + .pc_encode = nlmsvc_encode_res, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_res), + .pc_xdrressize = Ck+St, + }, + [NLMPROC_GRANTED] = { + .pc_func = nlmsvc_proc_granted, + .pc_decode = nlmsvc_decode_testargs, + .pc_encode = nlmsvc_encode_res, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_res), + .pc_xdrressize = Ck+St, + }, + [NLMPROC_TEST_MSG] = { + .pc_func = nlmsvc_proc_test_msg, + .pc_decode = nlmsvc_decode_testargs, + .pc_encode = nlmsvc_encode_void, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_LOCK_MSG] = { + .pc_func = nlmsvc_proc_lock_msg, + .pc_decode = nlmsvc_decode_lockargs, + .pc_encode = nlmsvc_encode_void, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_CANCEL_MSG] = { + .pc_func = nlmsvc_proc_cancel_msg, + .pc_decode = nlmsvc_decode_cancargs, + .pc_encode = nlmsvc_encode_void, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_UNLOCK_MSG] = { + .pc_func = nlmsvc_proc_unlock_msg, + .pc_decode = nlmsvc_decode_unlockargs, + .pc_encode = nlmsvc_encode_void, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_GRANTED_MSG] = { + .pc_func = nlmsvc_proc_granted_msg, + .pc_decode = nlmsvc_decode_testargs, + .pc_encode = nlmsvc_encode_void, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_TEST_RES] = { + .pc_func = nlmsvc_proc_null, + .pc_decode = nlmsvc_decode_void, + .pc_encode = nlmsvc_encode_void, + .pc_argsize = sizeof(struct nlm_res), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_LOCK_RES] = { + .pc_func = nlmsvc_proc_null, + .pc_decode = nlmsvc_decode_void, + .pc_encode = nlmsvc_encode_void, + .pc_argsize = sizeof(struct nlm_res), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_CANCEL_RES] = { + .pc_func = nlmsvc_proc_null, + .pc_decode = nlmsvc_decode_void, + .pc_encode = nlmsvc_encode_void, + .pc_argsize = sizeof(struct nlm_res), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_UNLOCK_RES] = { + .pc_func = nlmsvc_proc_null, + .pc_decode = nlmsvc_decode_void, + .pc_encode = nlmsvc_encode_void, + .pc_argsize = sizeof(struct nlm_res), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_GRANTED_RES] = { + .pc_func = nlmsvc_proc_granted_res, + .pc_decode = nlmsvc_decode_res, + .pc_encode = nlmsvc_encode_void, + .pc_argsize = sizeof(struct nlm_res), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_NSM_NOTIFY] = { + .pc_func = nlmsvc_proc_sm_notify, + .pc_decode = nlmsvc_decode_reboot, + .pc_encode = nlmsvc_encode_void, + .pc_argsize = sizeof(struct nlm_reboot), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [17] = { + .pc_func = nlmsvc_proc_unused, + .pc_decode = nlmsvc_decode_void, + .pc_encode = nlmsvc_encode_void, + .pc_argsize = sizeof(struct nlm_void), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [18] = { + .pc_func = nlmsvc_proc_unused, + .pc_decode = nlmsvc_decode_void, + .pc_encode = nlmsvc_encode_void, + .pc_argsize = sizeof(struct nlm_void), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [19] = { + .pc_func = nlmsvc_proc_unused, + .pc_decode = nlmsvc_decode_void, + .pc_encode = nlmsvc_encode_void, + .pc_argsize = sizeof(struct nlm_void), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = St, + }, + [NLMPROC_SHARE] = { + .pc_func = nlmsvc_proc_share, + .pc_decode = nlmsvc_decode_shareargs, + .pc_encode = nlmsvc_encode_shareres, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_res), + .pc_xdrressize = Ck+St+1, + }, + [NLMPROC_UNSHARE] = { + .pc_func = nlmsvc_proc_unshare, + .pc_decode = nlmsvc_decode_shareargs, + .pc_encode = nlmsvc_encode_shareres, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_res), + .pc_xdrressize = Ck+St+1, + }, + [NLMPROC_NM_LOCK] = { + .pc_func = nlmsvc_proc_nm_lock, + .pc_decode = nlmsvc_decode_lockargs, + .pc_encode = nlmsvc_encode_res, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_res), + .pc_xdrressize = Ck+St, + }, + [NLMPROC_FREE_ALL] = { + .pc_func = nlmsvc_proc_free_all, + .pc_decode = nlmsvc_decode_notify, + .pc_encode = nlmsvc_encode_void, + .pc_argsize = sizeof(struct nlm_args), + .pc_ressize = sizeof(struct nlm_void), + .pc_xdrressize = 0, + }, }; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 7b09a9158e40..34f546404aa1 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -383,8 +383,7 @@ static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = sbi->s_ninodes; buf->f_ffree = minix_count_free_inodes(sb); buf->f_namelen = sbi->s_namelen; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/namei.c b/fs/namei.c index f1eb8ccd2be9..d4a6dd772303 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1626,7 +1626,8 @@ static const char *pick_link(struct nameidata *nd, struct path *link, return ERR_PTR(error); } - if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS)) + if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) || + unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW)) return ERR_PTR(-ELOOP); if (!(nd->flags & LOOKUP_RCU)) { diff --git a/fs/namespace.c b/fs/namespace.c index 294e05a13d17..cebaa3e81794 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1191,7 +1191,7 @@ static void mntput_no_expire(struct mount *mnt) struct task_struct *task = current; if (likely(!(task->flags & PF_KTHREAD))) { init_task_work(&mnt->mnt_rcu, __cleanup_mnt); - if (!task_work_add(task, &mnt->mnt_rcu, true)) + if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME)) return; } if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) @@ -3171,6 +3171,8 @@ int path_mount(const char *dev_name, struct path *path, mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; + if (flags & MS_NOSYMFOLLOW) + mnt_flags |= MNT_NOSYMFOLLOW; /* The default atime for remount is preservation */ if ((flags & MS_REMOUNT) && diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 222afba70bc0..29ec8b09a52d 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -94,6 +94,7 @@ enum { static const struct constant_table nfs_param_enums_local_lock[] = { { "all", Opt_local_lock_all }, { "flock", Opt_local_lock_flock }, + { "posix", Opt_local_lock_posix }, { "none", Opt_local_lock_none }, {} }; diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 6b063227e34e..2bcbe38afe2e 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -32,9 +32,9 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ; /* * nfs_path - reconstruct the path given an arbitrary dentry * @base - used to return pointer to the end of devname part of path - * @dentry - pointer to dentry + * @dentry_in - pointer to dentry * @buffer - result buffer - * @buflen - length of buffer + * @buflen_in - length of buffer * @flags - options (see below) * * Helper function for constructing the server pathname @@ -49,15 +49,19 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ; * the original device (export) name * (if unset, the original name is returned verbatim) */ -char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen, - unsigned flags) +char *nfs_path(char **p, struct dentry *dentry_in, char *buffer, + ssize_t buflen_in, unsigned flags) { char *end; int namelen; unsigned seq; const char *base; + struct dentry *dentry; + ssize_t buflen; rename_retry: + buflen = buflen_in; + dentry = dentry_in; end = buffer+buflen; *--end = '\0'; buflen--; diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index 86777996cfec..b51424ff8159 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -67,7 +67,6 @@ struct nfs4_xattr_bucket { struct nfs4_xattr_cache { struct kref ref; - spinlock_t hash_lock; /* protects hashtable and lru */ struct nfs4_xattr_bucket buckets[NFS4_XATTR_HASH_SIZE]; struct list_head lru; struct list_head dispose; @@ -882,7 +881,7 @@ nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc) { unsigned long count; - count = list_lru_count(&nfs4_xattr_cache_lru); + count = list_lru_shrink_count(&nfs4_xattr_cache_lru, sc); return vfs_pressure_ratio(count); } @@ -976,7 +975,7 @@ nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc) lru = (shrink == &nfs4_xattr_large_entry_shrinker) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; - count = list_lru_count(lru); + count = list_lru_shrink_count(lru, sc); return vfs_pressure_ratio(count); } diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index cc50085e151c..0dc31ad2362e 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -45,6 +45,15 @@ #define encode_deallocate_maxsz (op_encode_hdr_maxsz + \ encode_fallocate_maxsz) #define decode_deallocate_maxsz (op_decode_hdr_maxsz) +#define encode_read_plus_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + 3) +#define NFS42_READ_PLUS_SEGMENT_SIZE (1 /* data_content4 */ + \ + 2 /* data_info4.di_offset */ + \ + 2 /* data_info4.di_length */) +#define decode_read_plus_maxsz (op_decode_hdr_maxsz + \ + 1 /* rpr_eof */ + \ + 1 /* rpr_contents count */ + \ + 2 * NFS42_READ_PLUS_SEGMENT_SIZE) #define encode_seek_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + \ 2 /* offset */ + \ @@ -128,6 +137,14 @@ decode_putfh_maxsz + \ decode_deallocate_maxsz + \ decode_getattr_maxsz) +#define NFS4_enc_read_plus_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_read_plus_maxsz) +#define NFS4_dec_read_plus_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_read_plus_maxsz) #define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -324,6 +341,16 @@ static void encode_deallocate(struct xdr_stream *xdr, encode_fallocate(xdr, args); } +static void encode_read_plus(struct xdr_stream *xdr, + const struct nfs_pgio_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_READ_PLUS, decode_read_plus_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->stateid); + encode_uint64(xdr, args->offset); + encode_uint32(xdr, args->count); +} + static void encode_seek(struct xdr_stream *xdr, const struct nfs42_seek_args *args, struct compound_hdr *hdr) @@ -723,6 +750,28 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, } /* + * Encode READ_PLUS request + */ +static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs_pgio_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_read_plus(xdr, args, &hdr); + + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->count, hdr.replen); + encode_nops(&hdr); +} + +/* * Encode SEEK request */ static void nfs4_xdr_enc_seek(struct rpc_rqst *req, @@ -970,6 +1019,97 @@ static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re return decode_op_hdr(xdr, OP_DEALLOCATE); } +static int decode_read_plus_data(struct xdr_stream *xdr, struct nfs_pgio_res *res, + uint32_t *eof) +{ + uint32_t count, recvd; + uint64_t offset; + __be32 *p; + + p = xdr_inline_decode(xdr, 8 + 4); + if (unlikely(!p)) + return -EIO; + + p = xdr_decode_hyper(p, &offset); + count = be32_to_cpup(p); + recvd = xdr_align_data(xdr, res->count, count); + res->count += recvd; + + if (count > recvd) { + dprintk("NFS: server cheating in read reply: " + "count %u > recvd %u\n", count, recvd); + *eof = 0; + return 1; + } + + return 0; +} + +static int decode_read_plus_hole(struct xdr_stream *xdr, struct nfs_pgio_res *res, + uint32_t *eof) +{ + uint64_t offset, length, recvd; + __be32 *p; + + p = xdr_inline_decode(xdr, 8 + 8); + if (unlikely(!p)) + return -EIO; + + p = xdr_decode_hyper(p, &offset); + p = xdr_decode_hyper(p, &length); + recvd = xdr_expand_hole(xdr, res->count, length); + res->count += recvd; + + if (recvd < length) { + *eof = 0; + return 1; + } + return 0; +} + +static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) +{ + uint32_t eof, segments, type; + int status, i; + __be32 *p; + + status = decode_op_hdr(xdr, OP_READ_PLUS); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(!p)) + return -EIO; + + eof = be32_to_cpup(p++); + segments = be32_to_cpup(p++); + if (segments == 0) + goto out; + + for (i = 0; i < segments; i++) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + + type = be32_to_cpup(p++); + if (type == NFS4_CONTENT_DATA) + status = decode_read_plus_data(xdr, res, &eof); + else if (type == NFS4_CONTENT_HOLE) + status = decode_read_plus_hole(xdr, res, &eof); + else + return -EINVAL; + + if (status < 0) + return status; + if (status > 0) + break; + } + +out: + res->eof = eof; + return 0; +} + static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res) { int status; @@ -1147,6 +1287,33 @@ out: } /* + * Decode READ_PLUS request + */ +static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs_pgio_res *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_read_plus(xdr, res); + if (!status) + status = res->count; +out: + return status; +} + +/* * Decode SEEK request */ static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 0c9505dc852c..065cb04222a1 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -599,6 +599,14 @@ static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stat return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0; } +static inline bool nfs4_stateid_is_next(const nfs4_stateid *s1, const nfs4_stateid *s2) +{ + u32 seq1 = be32_to_cpu(s1->seqid); + u32 seq2 = be32_to_cpu(s2->seqid); + + return seq2 == seq1 + 1U || (seq2 == 1U && seq1 == 0xffffffffU); +} + static inline bool nfs4_stateid_match_or_older(const nfs4_stateid *dst, const nfs4_stateid *src) { return nfs4_stateid_match_other(dst, src) && diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index daacc78a3d48..be7915c861ce 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1045,6 +1045,8 @@ static int nfs4_server_common_setup(struct nfs_server *server, server->caps |= server->nfs_client->cl_mvops->init_caps; if (server->flags & NFS_MOUNT_NORDIRPLUS) server->caps &= ~NFS_CAP_READDIRPLUS; + if (server->nfs_client->cl_proto == XPRT_TRANSPORT_RDMA) + server->caps &= ~NFS_CAP_READ_PLUS; /* * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower * authentication. diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index fdfc77486ace..9d354de613da 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -9,6 +9,7 @@ #include <linux/falloc.h> #include <linux/mount.h> #include <linux/nfs_fs.h> +#include <linux/nfs_ssc.h> #include "delegation.h" #include "internal.h" #include "iostat.h" @@ -145,7 +146,8 @@ static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in, /* Only offload copy if superblock is the same */ if (file_in->f_op != &nfs4_file_operations) return -EXDEV; - if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY)) + if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY) || + !nfs_server_capable(file_inode(file_in), NFS_CAP_COPY)) return -EOPNOTSUPP; if (file_inode(file_in) == file_inode(file_out)) return -EOPNOTSUPP; @@ -314,9 +316,8 @@ out: static int read_name_gen = 1; #define SSC_READ_NAME_BODY "ssc_read_%d" -struct file * -nfs42_ssc_open(struct vfsmount *ss_mnt, struct nfs_fh *src_fh, - nfs4_stateid *stateid) +static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, + struct nfs_fh *src_fh, nfs4_stateid *stateid) { struct nfs_fattr fattr; struct file *filep, *res; @@ -398,14 +399,40 @@ out_filep: fput(filep); goto out_free_name; } -EXPORT_SYMBOL_GPL(nfs42_ssc_open); -void nfs42_ssc_close(struct file *filep) + +static void __nfs42_ssc_close(struct file *filep) { struct nfs_open_context *ctx = nfs_file_open_context(filep); ctx->state->flags = 0; } -EXPORT_SYMBOL_GPL(nfs42_ssc_close); + +static const struct nfs4_ssc_client_ops nfs4_ssc_clnt_ops_tbl = { + .sco_open = __nfs42_ssc_open, + .sco_close = __nfs42_ssc_close, +}; + +/** + * nfs42_ssc_register_ops - Wrapper to register NFS_V4 ops in nfs_common + * + * Return values: + * None + */ +void nfs42_ssc_register_ops(void) +{ + nfs42_ssc_register(&nfs4_ssc_clnt_ops_tbl); +} + +/** + * nfs42_ssc_unregister_ops - wrapper to un-register NFS_V4 ops in nfs_common + * + * Return values: + * None. + */ +void nfs42_ssc_unregister_ops(void) +{ + nfs42_ssc_unregister(&nfs4_ssc_clnt_ops_tbl); +} #endif /* CONFIG_NFS_V4_2 */ const struct file_operations nfs4_file_operations = { diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 62e6eea5c516..8d8aba305ecc 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -46,6 +46,7 @@ #include <keys/user-type.h> #include <keys/request_key_auth-type.h> #include <linux/module.h> +#include <linux/user_namespace.h> #include "internal.h" #include "netns.h" @@ -69,13 +70,13 @@ struct idmap { struct rpc_pipe *idmap_pipe; struct idmap_legacy_upcalldata *idmap_upcall_data; struct mutex idmap_mutex; - const struct cred *cred; + struct user_namespace *user_ns; }; static struct user_namespace *idmap_userns(const struct idmap *idmap) { - if (idmap && idmap->cred) - return idmap->cred->user_ns; + if (idmap && idmap->user_ns) + return idmap->user_ns; return &init_user_ns; } @@ -286,7 +287,7 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen, if (ret < 0) return ERR_PTR(ret); - if (!idmap->cred || idmap->cred->user_ns == &init_user_ns) + if (!idmap->user_ns || idmap->user_ns == &init_user_ns) rkey = request_key(&key_type_id_resolver, desc, ""); if (IS_ERR(rkey)) { mutex_lock(&idmap->idmap_mutex); @@ -462,7 +463,7 @@ nfs_idmap_new(struct nfs_client *clp) return -ENOMEM; mutex_init(&idmap->idmap_mutex); - idmap->cred = get_cred(clp->cl_rpcclient->cl_cred); + idmap->user_ns = get_user_ns(clp->cl_rpcclient->cl_cred->user_ns); rpc_init_pipe_dir_object(&idmap->idmap_pdo, &nfs_idmap_pipe_dir_object_ops, @@ -486,7 +487,7 @@ nfs_idmap_new(struct nfs_client *clp) err_destroy_pipe: rpc_destroy_pipe_data(idmap->idmap_pipe); err: - put_cred(idmap->cred); + get_user_ns(idmap->user_ns); kfree(idmap); return error; } @@ -503,7 +504,7 @@ nfs_idmap_delete(struct nfs_client *clp) &clp->cl_rpcclient->cl_pipedir_objects, &idmap->idmap_pdo); rpc_destroy_pipe_data(idmap->idmap_pipe); - put_cred(idmap->cred); + put_user_ns(idmap->user_ns); kfree(idmap); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6e95c85fe395..9e0ca9b2b210 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -63,6 +63,7 @@ #include "callback.h" #include "pnfs.h" #include "netns.h" +#include "sysfs.h" #include "nfs4idmap.h" #include "nfs4session.h" #include "fscache.h" @@ -70,6 +71,10 @@ #include "nfs4trace.h" +#ifdef CONFIG_NFS_V4_2 +#include "nfs42.h" +#endif /* CONFIG_NFS_V4_2 */ + #define NFSDBG_FACILITY NFSDBG_PROC #define NFS4_BITMASK_SZ 3 @@ -107,6 +112,9 @@ static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, const struct cred *, bool); #endif +static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode, + struct nfs_server *server, + struct nfs4_label *label); #ifdef CONFIG_NFS_V4_SECURITY_LABEL static inline struct nfs4_label * @@ -1547,19 +1555,6 @@ static void nfs_state_log_update_open_stateid(struct nfs4_state *state) wake_up_all(&state->waitq); } -static void nfs_state_log_out_of_order_open_stateid(struct nfs4_state *state, - const nfs4_stateid *stateid) -{ - u32 state_seqid = be32_to_cpu(state->open_stateid.seqid); - u32 stateid_seqid = be32_to_cpu(stateid->seqid); - - if (stateid_seqid == state_seqid + 1U || - (stateid_seqid == 1U && state_seqid == 0xffffffffU)) - nfs_state_log_update_open_stateid(state); - else - set_bit(NFS_STATE_CHANGE_WAIT, &state->flags); -} - static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) { struct nfs_client *clp = state->owner->so_server->nfs_client; @@ -1585,21 +1580,19 @@ static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) * i.e. The stateid seqids have to be initialised to 1, and * are then incremented on every state transition. */ -static bool nfs_need_update_open_stateid(struct nfs4_state *state, +static bool nfs_stateid_is_sequential(struct nfs4_state *state, const nfs4_stateid *stateid) { - if (test_bit(NFS_OPEN_STATE, &state->flags) == 0 || - !nfs4_stateid_match_other(stateid, &state->open_stateid)) { + if (test_bit(NFS_OPEN_STATE, &state->flags)) { + /* The common case - we're updating to a new sequence number */ + if (nfs4_stateid_match_other(stateid, &state->open_stateid) && + nfs4_stateid_is_next(&state->open_stateid, stateid)) { + return true; + } + } else { + /* This is the first OPEN in this generation */ if (stateid->seqid == cpu_to_be32(1)) - nfs_state_log_update_open_stateid(state); - else - set_bit(NFS_STATE_CHANGE_WAIT, &state->flags); - return true; - } - - if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) { - nfs_state_log_out_of_order_open_stateid(state, stateid); - return true; + return true; } return false; } @@ -1673,16 +1666,16 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, int status = 0; for (;;) { - if (!nfs_need_update_open_stateid(state, stateid)) - return; - if (!test_bit(NFS_STATE_CHANGE_WAIT, &state->flags)) + if (nfs_stateid_is_sequential(state, stateid)) break; + if (status) break; /* Rely on seqids for serialisation with NFSv4.0 */ if (!nfs4_has_session(NFS_SERVER(state->inode)->nfs_client)) break; + set_bit(NFS_STATE_CHANGE_WAIT, &state->flags); prepare_to_wait(&state->waitq, &wait, TASK_KILLABLE); /* * Ensure we process the state changes in the same order @@ -1693,6 +1686,7 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, spin_unlock(&state->owner->so_lock); rcu_read_unlock(); trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0); + if (!signal_pending(current)) { if (schedule_timeout(5*HZ) == 0) status = -EAGAIN; @@ -3435,7 +3429,8 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst, __be32 seqid_open; u32 dst_seqid; bool ret; - int seq; + int seq, status = -EAGAIN; + DEFINE_WAIT(wait); for (;;) { ret = false; @@ -3447,15 +3442,41 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst, continue; break; } + + write_seqlock(&state->seqlock); seqid_open = state->open_stateid.seqid; - if (read_seqretry(&state->seqlock, seq)) - continue; dst_seqid = be32_to_cpu(dst->seqid); - if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) >= 0) - dst->seqid = cpu_to_be32(dst_seqid + 1); - else + + /* Did another OPEN bump the state's seqid? try again: */ + if ((s32)(be32_to_cpu(seqid_open) - dst_seqid) > 0) { dst->seqid = seqid_open; + write_sequnlock(&state->seqlock); + ret = true; + break; + } + + /* server says we're behind but we haven't seen the update yet */ + set_bit(NFS_STATE_CHANGE_WAIT, &state->flags); + prepare_to_wait(&state->waitq, &wait, TASK_KILLABLE); + write_sequnlock(&state->seqlock); + trace_nfs4_close_stateid_update_wait(state->inode, dst, 0); + + if (signal_pending(current)) + status = -EINTR; + else + if (schedule_timeout(5*HZ) != 0) + status = 0; + + finish_wait(&state->waitq, &wait); + + if (!status) + continue; + if (status == -EINTR) + break; + + /* we slept the whole 5 seconds, we must have lost a seqid */ + dst->seqid = cpu_to_be32(dst_seqid + 1); ret = true; break; } @@ -3632,9 +3653,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (calldata->arg.fmode == 0 || calldata->arg.fmode == FMODE_READ) { /* Close-to-open cache consistency revalidation */ - if (!nfs4_have_delegation(inode, FMODE_READ)) + if (!nfs4_have_delegation(inode, FMODE_READ)) { calldata->arg.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; - else + nfs4_bitmask_adjust(calldata->arg.bitmask, inode, NFS_SERVER(inode), NULL); + } else calldata->arg.bitmask = NULL; } @@ -5255,28 +5277,60 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task, return true; } -static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) +static bool nfs4_read_plus_not_supported(struct rpc_task *task, + struct nfs_pgio_header *hdr) { + struct nfs_server *server = NFS_SERVER(hdr->inode); + struct rpc_message *msg = &task->tk_msg; + if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS] && + server->caps & NFS_CAP_READ_PLUS && task->tk_status == -ENOTSUPP) { + server->caps &= ~NFS_CAP_READ_PLUS; + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + rpc_restart_call_prepare(task); + return true; + } + return false; +} + +static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) +{ dprintk("--> %s\n", __func__); if (!nfs4_sequence_done(task, &hdr->res.seq_res)) return -EAGAIN; if (nfs4_read_stateid_changed(task, &hdr->args)) return -EAGAIN; + if (nfs4_read_plus_not_supported(task, hdr)) + return -EAGAIN; if (task->tk_status > 0) nfs_invalidate_atime(hdr->inode); return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) : nfs4_read_done_cb(task, hdr); } +#ifdef CONFIG_NFS_V4_2 +static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg) +{ + if (server->caps & NFS_CAP_READ_PLUS) + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS]; + else + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; +} +#else +static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg) +{ + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; +} +#endif /* CONFIG_NFS_V4_2 */ + static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr, struct rpc_message *msg) { hdr->timestamp = jiffies; if (!hdr->pgio_done_cb) hdr->pgio_done_cb = nfs4_read_done_cb; - msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + nfs42_read_plus_support(NFS_SERVER(hdr->inode), msg); nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0); } @@ -5360,6 +5414,38 @@ bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr) return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; } +static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode, + struct nfs_server *server, + struct nfs4_label *label) +{ + + unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + + if ((cache_validity & NFS_INO_INVALID_DATA) || + (cache_validity & NFS_INO_REVAL_PAGECACHE) || + (cache_validity & NFS_INO_REVAL_FORCED) || + (cache_validity & NFS_INO_INVALID_OTHER)) + nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, label), inode); + + if (cache_validity & NFS_INO_INVALID_ATIME) + bitmask[1] |= FATTR4_WORD1_TIME_ACCESS; + if (cache_validity & NFS_INO_INVALID_ACCESS) + bitmask[0] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | + FATTR4_WORD1_OWNER_GROUP; + if (cache_validity & NFS_INO_INVALID_ACL) + bitmask[0] |= FATTR4_WORD0_ACL; + if (cache_validity & NFS_INO_INVALID_LABEL) + bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL; + if (cache_validity & NFS_INO_INVALID_CTIME) + bitmask[0] |= FATTR4_WORD0_CHANGE; + if (cache_validity & NFS_INO_INVALID_MTIME) + bitmask[1] |= FATTR4_WORD1_TIME_MODIFY; + if (cache_validity & NFS_INO_INVALID_SIZE) + bitmask[0] |= FATTR4_WORD0_SIZE; + if (cache_validity & NFS_INO_INVALID_BLOCKS) + bitmask[1] |= FATTR4_WORD1_SPACE_USED; +} + static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, struct rpc_message *msg, struct rpc_clnt **clnt) @@ -5369,8 +5455,10 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, if (!nfs4_write_need_cache_consistency_data(hdr)) { hdr->args.bitmask = NULL; hdr->res.fattr = NULL; - } else + } else { hdr->args.bitmask = server->cache_consistency_bitmask; + nfs4_bitmask_adjust(hdr->args.bitmask, hdr->inode, server, NULL); + } if (!hdr->pgio_done_cb) hdr->pgio_done_cb = nfs4_write_done_cb; @@ -6006,9 +6094,34 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, memcpy(bootverf->data, verf, sizeof(bootverf->data)); } +static size_t +nfs4_get_uniquifier(struct nfs_client *clp, char *buf, size_t buflen) +{ + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + struct nfs_netns_client *nn_clp = nn->nfs_client; + const char *id; + + buf[0] = '\0'; + + if (nn_clp) { + rcu_read_lock(); + id = rcu_dereference(nn_clp->identifier); + if (id) + strscpy(buf, id, buflen); + rcu_read_unlock(); + } + + if (nfs4_client_id_uniquifier[0] != '\0' && buf[0] == '\0') + strscpy(buf, nfs4_client_id_uniquifier, buflen); + + return strlen(buf); +} + static int nfs4_init_nonuniform_client_string(struct nfs_client *clp) { + char buf[NFS4_CLIENT_ID_UNIQ_LEN]; + size_t buflen; size_t len; char *str; @@ -6022,8 +6135,11 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) + 1; rcu_read_unlock(); - if (nfs4_client_id_uniquifier[0] != '\0') - len += strlen(nfs4_client_id_uniquifier) + 1; + + buflen = nfs4_get_uniquifier(clp, buf, sizeof(buf)); + if (buflen) + len += buflen + 1; + if (len > NFS4_OPAQUE_LIMIT + 1) return -EINVAL; @@ -6037,10 +6153,9 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) return -ENOMEM; rcu_read_lock(); - if (nfs4_client_id_uniquifier[0] != '\0') + if (buflen) scnprintf(str, len, "Linux NFSv4.0 %s/%s/%s", - clp->cl_rpcclient->cl_nodename, - nfs4_client_id_uniquifier, + clp->cl_rpcclient->cl_nodename, buf, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); else @@ -6055,50 +6170,23 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) } static int -nfs4_init_uniquifier_client_string(struct nfs_client *clp) -{ - size_t len; - char *str; - - len = 10 + 10 + 1 + 10 + 1 + - strlen(nfs4_client_id_uniquifier) + 1 + - strlen(clp->cl_rpcclient->cl_nodename) + 1; - - if (len > NFS4_OPAQUE_LIMIT + 1) - return -EINVAL; - - /* - * Since this string is allocated at mount time, and held until the - * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying - * about a memory-reclaim deadlock. - */ - str = kmalloc(len, GFP_KERNEL); - if (!str) - return -ENOMEM; - - scnprintf(str, len, "Linux NFSv%u.%u %s/%s", - clp->rpc_ops->version, clp->cl_minorversion, - nfs4_client_id_uniquifier, - clp->cl_rpcclient->cl_nodename); - clp->cl_owner_id = str; - return 0; -} - -static int nfs4_init_uniform_client_string(struct nfs_client *clp) { + char buf[NFS4_CLIENT_ID_UNIQ_LEN]; + size_t buflen; size_t len; char *str; if (clp->cl_owner_id != NULL) return 0; - if (nfs4_client_id_uniquifier[0] != '\0') - return nfs4_init_uniquifier_client_string(clp); - len = 10 + 10 + 1 + 10 + 1 + strlen(clp->cl_rpcclient->cl_nodename) + 1; + buflen = nfs4_get_uniquifier(clp, buf, sizeof(buf)); + if (buflen) + len += buflen + 1; + if (len > NFS4_OPAQUE_LIMIT + 1) return -EINVAL; @@ -6111,9 +6199,14 @@ nfs4_init_uniform_client_string(struct nfs_client *clp) if (!str) return -ENOMEM; - scnprintf(str, len, "Linux NFSv%u.%u %s", - clp->rpc_ops->version, clp->cl_minorversion, - clp->cl_rpcclient->cl_nodename); + if (buflen) + scnprintf(str, len, "Linux NFSv%u.%u %s/%s", + clp->rpc_ops->version, clp->cl_minorversion, + buf, clp->cl_rpcclient->cl_nodename); + else + scnprintf(str, len, "Linux NFSv%u.%u %s", + clp->rpc_ops->version, clp->cl_minorversion, + clp->cl_rpcclient->cl_nodename); clp->cl_owner_id = str; return 0; } @@ -6406,6 +6499,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, data->args.fhandle = &data->fh; data->args.stateid = &data->stateid; data->args.bitmask = server->cache_consistency_bitmask; + nfs4_bitmask_adjust(data->args.bitmask, inode, server, NULL); nfs_copy_fh(&data->fh, NFS_FH(inode)); nfs4_stateid_copy(&data->stateid, stateid); data->res.fattr = &data->fattr; @@ -7440,7 +7534,7 @@ nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len) if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) { len = security_inode_listsecurity(inode, list, list_len); - if (list_len && len > list_len) + if (len >= 0 && list_len && len > list_len) return -ERANGE; } return len; @@ -8039,9 +8133,11 @@ int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or * DS flags set. */ -static int nfs4_check_cl_exchange_flags(u32 flags) +static int nfs4_check_cl_exchange_flags(u32 flags, u32 version) { - if (flags & ~EXCHGID4_FLAG_MASK_R) + if (version >= 2 && (flags & ~EXCHGID4_2_FLAG_MASK_R)) + goto out_inval; + else if (version < 2 && (flags & ~EXCHGID4_FLAG_MASK_R)) goto out_inval; if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) && (flags & EXCHGID4_FLAG_USE_NON_PNFS)) @@ -8454,7 +8550,8 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cre if (status != 0) goto out; - status = nfs4_check_cl_exchange_flags(resp->flags); + status = nfs4_check_cl_exchange_flags(resp->flags, + clp->cl_mvops->minor_version); if (status != 0) goto out; @@ -9693,7 +9790,6 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_argp = &args, .rpc_resp = &res, }; - struct rpc_clnt *clnt = server->client; struct nfs4_call_sync_data data = { .seq_server = server, .seq_args = &args.seq_args, @@ -9710,8 +9806,7 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, int status; if (use_integrity) { - clnt = server->nfs_client->cl_rpcclient; - task_setup.rpc_client = clnt; + task_setup.rpc_client = server->nfs_client->cl_rpcclient; cred = nfs4_get_clid_cred(server->nfs_client); msg.rpc_cred = cred; @@ -10165,7 +10260,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS | NFS_CAP_CLONE - | NFS_CAP_LAYOUTERROR, + | NFS_CAP_LAYOUTERROR + | NFS_CAP_READ_PLUS, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 0c1ab846b83d..93f5c1678ec2 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -7,6 +7,7 @@ #include <linux/mount.h> #include <linux/nfs4_mount.h> #include <linux/nfs_fs.h> +#include <linux/nfs_ssc.h> #include "delegation.h" #include "internal.h" #include "nfs4_fs.h" @@ -279,6 +280,9 @@ static int __init init_nfs_v4(void) if (err) goto out2; +#ifdef CONFIG_NFS_V4_2 + nfs42_ssc_register_ops(); +#endif register_nfs_version(&nfs_v4); return 0; out2: @@ -297,6 +301,7 @@ static void __exit exit_nfs_v4(void) unregister_nfs_version(&nfs_v4); #ifdef CONFIG_NFS_V4_2 nfs4_xattr_cache_exit(); + nfs42_ssc_unregister_ops(); #endif nfs4_unregister_sysctl(); nfs_idmap_quit(); diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index b4f852d4d099..484c1da96dea 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -1511,6 +1511,7 @@ DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr); DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn); DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update); DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update_wait); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_close_stateid_update_wait); DECLARE_EVENT_CLASS(nfs4_getattr_event, TP_PROTO( diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 0b3510f62623..c6dbfcae7517 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -5308,7 +5308,6 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, uint32_t attrlen, bitmap[3] = {0}; int status; - unsigned int pg_offset; res->acl_len = 0; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -5316,9 +5315,6 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, xdr_enter_page(xdr, xdr->buf->page_len); - /* Calculate the offset of the page data */ - pg_offset = xdr->buf->head[0].iov_len; - if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) goto out; if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) @@ -5331,7 +5327,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, /* The bitmap (xdr len + bitmaps) and the attr xdr len words * are stored with the acl data to handle the problem of * variable length bitmaps.*/ - res->acl_data_offset = xdr_stream_pos(xdr) - pg_offset; + res->acl_data_offset = xdr_page_pos(xdr); res->acl_len = attrlen; /* Check for receive buffer overflow */ @@ -7619,6 +7615,7 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC42(SETXATTR, enc_setxattr, dec_setxattr), PROC42(LISTXATTRS, enc_listxattrs, dec_listxattrs), PROC42(REMOVEXATTR, enc_removexattr, dec_removexattr), + PROC42(READ_PLUS, enc_read_plus, dec_read_plus), }; static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)]; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 71f7741126b6..0e50b9d45c32 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -902,7 +902,7 @@ restart: } /* - * Called by the state manger to remove all layouts established under an + * Called by the state manager to remove all layouts established under an * expired lease. */ void diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f943e37853fa..4034102010f0 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -57,6 +57,7 @@ #include <linux/rcupdate.h> #include <linux/uaccess.h> +#include <linux/nfs_ssc.h> #include "nfs4_fs.h" #include "callback.h" @@ -85,6 +86,10 @@ const struct super_operations nfs_sops = { }; EXPORT_SYMBOL_GPL(nfs_sops); +static const struct nfs_ssc_client_ops nfs_ssc_clnt_ops_tbl = { + .sco_sb_deactive = nfs_sb_deactive, +}; + #if IS_ENABLED(CONFIG_NFS_V4) static int __init register_nfs4_fs(void) { @@ -106,6 +111,16 @@ static void unregister_nfs4_fs(void) } #endif +static void nfs_ssc_register_ops(void) +{ + nfs_ssc_register(&nfs_ssc_clnt_ops_tbl); +} + +static void nfs_ssc_unregister_ops(void) +{ + nfs_ssc_unregister(&nfs_ssc_clnt_ops_tbl); +} + static struct shrinker acl_shrinker = { .count_objects = nfs_access_cache_count, .scan_objects = nfs_access_cache_scan, @@ -133,6 +148,7 @@ int __init register_nfs_fs(void) ret = register_shrinker(&acl_shrinker); if (ret < 0) goto error_3; + nfs_ssc_register_ops(); return 0; error_3: nfs_unregister_sysctl(); @@ -152,6 +168,7 @@ void __exit unregister_nfs_fs(void) unregister_shrinker(&acl_shrinker); nfs_unregister_sysctl(); unregister_nfs4_fs(); + nfs_ssc_unregister_ops(); unregister_filesystem(&nfs_fs_type); } @@ -889,7 +906,7 @@ static struct nfs_server *nfs_try_mount_request(struct fs_context *fc) default: if (rpcauth_get_gssinfo(flavor, &info) != 0) continue; - /* Fallthrough */ + break; } dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor); ctx->selected_flavor = flavor; diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index c489496b5659..8cb70755e3c9 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -79,7 +79,12 @@ static ssize_t nfs_netns_identifier_show(struct kobject *kobj, struct nfs_netns_client *c = container_of(kobj, struct nfs_netns_client, kobject); - return scnprintf(buf, PAGE_SIZE, "%s\n", c->identifier); + ssize_t ret; + + rcu_read_lock(); + ret = scnprintf(buf, PAGE_SIZE, "%s\n", rcu_dereference(c->identifier)); + rcu_read_unlock(); + return ret; } /* Strip trailing '\n' */ @@ -107,7 +112,7 @@ static ssize_t nfs_netns_identifier_store(struct kobject *kobj, p = kmemdup_nul(buf, len, GFP_KERNEL); if (!p) return -ENOMEM; - old = xchg(&c->identifier, p); + old = rcu_dereference_protected(xchg(&c->identifier, (char __rcu *)p), 1); if (old) { synchronize_rcu(); kfree(old); @@ -121,7 +126,7 @@ static void nfs_netns_client_release(struct kobject *kobj) struct nfs_netns_client, kobject); - kfree(c->identifier); + kfree(rcu_dereference_raw(c->identifier)); kfree(c); } diff --git a/fs/nfs/sysfs.h b/fs/nfs/sysfs.h index ebcbdc40483b..5501ef573c32 100644 --- a/fs/nfs/sysfs.h +++ b/fs/nfs/sysfs.h @@ -11,7 +11,7 @@ struct nfs_netns_client { struct kobject kobject; struct net *net; - const char *identifier; + const char __rcu *identifier; }; extern struct kobject *nfs_client_kobj; diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile index 4bebe834c009..fa82f5aaa6d9 100644 --- a/fs/nfs_common/Makefile +++ b/fs/nfs_common/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o nfs_acl-objs := nfsacl.o obj-$(CONFIG_GRACE_PERIOD) += grace.o +obj-$(CONFIG_GRACE_PERIOD) += nfs_ssc.o diff --git a/fs/nfs_common/nfs_ssc.c b/fs/nfs_common/nfs_ssc.c new file mode 100644 index 000000000000..f43bbb373913 --- /dev/null +++ b/fs/nfs_common/nfs_ssc.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * fs/nfs_common/nfs_ssc_comm.c + * + * Helper for knfsd's SSC to access ops in NFS client modules + * + * Author: Dai Ngo <dai.ngo@oracle.com> + * + * Copyright (c) 2020, Oracle and/or its affiliates. + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/nfs_ssc.h> +#include "../nfs/nfs4_fs.h" + +MODULE_LICENSE("GPL"); + +struct nfs_ssc_client_ops_tbl nfs_ssc_client_tbl; +EXPORT_SYMBOL_GPL(nfs_ssc_client_tbl); + +#ifdef CONFIG_NFS_V4_2 +/** + * nfs42_ssc_register - install the NFS_V4 client ops in the nfs_ssc_client_tbl + * @ops: NFS_V4 ops to be installed + * + * Return values: + * None + */ +void nfs42_ssc_register(const struct nfs4_ssc_client_ops *ops) +{ + nfs_ssc_client_tbl.ssc_nfs4_ops = ops; +} +EXPORT_SYMBOL_GPL(nfs42_ssc_register); + +/** + * nfs42_ssc_unregister - uninstall the NFS_V4 client ops from + * the nfs_ssc_client_tbl + * @ops: ops to be uninstalled + * + * Return values: + * None + */ +void nfs42_ssc_unregister(const struct nfs4_ssc_client_ops *ops) +{ + if (nfs_ssc_client_tbl.ssc_nfs4_ops != ops) + return; + + nfs_ssc_client_tbl.ssc_nfs4_ops = NULL; +} +EXPORT_SYMBOL_GPL(nfs42_ssc_unregister); +#endif /* CONFIG_NFS_V4_2 */ + +#ifdef CONFIG_NFS_V4_2 +/** + * nfs_ssc_register - install the NFS_FS client ops in the nfs_ssc_client_tbl + * @ops: NFS_FS ops to be installed + * + * Return values: + * None + */ +void nfs_ssc_register(const struct nfs_ssc_client_ops *ops) +{ + nfs_ssc_client_tbl.ssc_nfs_ops = ops; +} +EXPORT_SYMBOL_GPL(nfs_ssc_register); + +/** + * nfs_ssc_unregister - uninstall the NFS_FS client ops from + * the nfs_ssc_client_tbl + * @ops: ops to be uninstalled + * + * Return values: + * None + */ +void nfs_ssc_unregister(const struct nfs_ssc_client_ops *ops) +{ + if (nfs_ssc_client_tbl.ssc_nfs_ops != ops) + return; + nfs_ssc_client_tbl.ssc_nfs_ops = NULL; +} +EXPORT_SYMBOL_GPL(nfs_ssc_unregister); + +#else +void nfs_ssc_register(const struct nfs_ssc_client_ops *ops) +{ +} +EXPORT_SYMBOL_GPL(nfs_ssc_register); + +void nfs_ssc_unregister(const struct nfs_ssc_client_ops *ops) +{ +} +EXPORT_SYMBOL_GPL(nfs_ssc_unregister); +#endif /* CONFIG_NFS_V4_2 */ diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 99d2cae91bd6..dbbc583d6273 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -136,7 +136,7 @@ config NFSD_FLEXFILELAYOUT config NFSD_V4_2_INTER_SSC bool "NFSv4.2 inter server to server COPY" - depends on NFSD_V4 && NFS_V4_1 && NFS_V4_2 && NFS_FS=y + depends on NFSD_V4 && NFS_V4_1 && NFS_V4_2 help This option enables support for NFSv4.2 inter server to server copy where the destination server calls the NFSv4.2 @@ -156,13 +156,3 @@ config NFSD_V4_SECURITY_LABEL If you do not wish to enable fine-grained security labels SELinux or Smack policies on NFSv4 files, say N. - -config NFSD_FAULT_INJECTION - bool "NFS server manual fault injection" - depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS && BROKEN - help - This option enables support for manually injecting faults - into the NFS server. This is intended to be used for - testing error recovery on the NFS client. - - If unsure, say N. diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 6a40b1afe703..3f0983e93a99 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -13,7 +13,6 @@ nfsd-y += trace.o nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ export.o auth.o lockd.o nfscache.o nfsxdr.o \ stats.o filecache.o -nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index cb777fe82988..21e404e7cb68 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1002,7 +1002,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) if (nfsd4_spo_must_allow(rqstp)) return 0; - return nfserr_wrongsec; + return rqstp->rq_vers < 4 ? nfserr_acces : nfserr_wrongsec; } /* diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index c8b9d2667ee6..3c6c2f7d1688 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -889,7 +889,7 @@ nfsd_file_find_locked(struct inode *inode, unsigned int may_flags, hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head, nf_node, lockdep_is_held(&nfsd_file_hashtbl[hashval].nfb_lock)) { - if ((need & nf->nf_may) != need) + if (nf->nf_may != need) continue; if (nf->nf_inode != inode) continue; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index cbab1d2d8a75..6a900f770dd2 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -14,7 +14,6 @@ #include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC -#define RETURN_STATUS(st) { resp->status = (st); return (st); } /* * NULL call. @@ -22,7 +21,7 @@ static __be32 nfsacld_proc_null(struct svc_rqst *rqstp) { - return nfs_ok; + return rpc_success; } /* @@ -35,24 +34,25 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp) struct posix_acl *acl; struct inode *inode; svc_fh *fh; - __be32 nfserr = 0; dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); fh = fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); - if (nfserr) - RETURN_STATUS(nfserr); + resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (resp->status != nfs_ok) + goto out; inode = d_inode(fh->fh_dentry); - if (argp->mask & ~NFS_ACL_MASK) - RETURN_STATUS(nfserr_inval); + if (argp->mask & ~NFS_ACL_MASK) { + resp->status = nfserr_inval; + goto out; + } resp->mask = argp->mask; - nfserr = fh_getattr(fh, &resp->stat); - if (nfserr) - RETURN_STATUS(nfserr); + resp->status = fh_getattr(fh, &resp->stat); + if (resp->status != nfs_ok) + goto out; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { acl = get_acl(inode, ACL_TYPE_ACCESS); @@ -61,7 +61,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp) acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); } if (IS_ERR(acl)) { - nfserr = nfserrno(PTR_ERR(acl)); + resp->status = nfserrno(PTR_ERR(acl)); goto fail; } resp->acl_access = acl; @@ -71,19 +71,20 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp) of a non-directory! */ acl = get_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { - nfserr = nfserrno(PTR_ERR(acl)); + resp->status = nfserrno(PTR_ERR(acl)); goto fail; } resp->acl_default = acl; } /* resp->acl_{access,default} are released in nfssvc_release_getacl. */ - RETURN_STATUS(0); +out: + return rpc_success; fail: posix_acl_release(resp->acl_access); posix_acl_release(resp->acl_default); - RETURN_STATUS(nfserr); + goto out; } /* @@ -95,14 +96,13 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp) struct nfsd_attrstat *resp = rqstp->rq_resp; struct inode *inode; svc_fh *fh; - __be32 nfserr = 0; int error; dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); fh = fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); - if (nfserr) + resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + if (resp->status != nfs_ok) goto out; inode = d_inode(fh->fh_dentry); @@ -124,19 +124,20 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp) fh_drop_write(fh); - nfserr = fh_getattr(fh, &resp->stat); + resp->status = fh_getattr(fh, &resp->stat); out: /* argp->acl_{access,default} may have been allocated in nfssvc_decode_setaclargs. */ posix_acl_release(argp->acl_access); posix_acl_release(argp->acl_default); - return nfserr; + return rpc_success; + out_drop_lock: fh_unlock(fh); fh_drop_write(fh); out_errno: - nfserr = nfserrno(error); + resp->status = nfserrno(error); goto out; } @@ -147,15 +148,16 @@ static __be32 nfsacld_proc_getattr(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_attrstat *resp = rqstp->rq_resp; - __be32 nfserr; + dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); - if (nfserr) - return nfserr; - nfserr = fh_getattr(&resp->fh, &resp->stat); - return nfserr; + resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (resp->status != nfs_ok) + goto out; + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; } /* @@ -165,7 +167,6 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp) { struct nfsd3_accessargs *argp = rqstp->rq_argp; struct nfsd3_accessres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: ACCESS(2acl) %s 0x%x\n", SVCFH_fmt(&argp->fh), @@ -173,16 +174,22 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->access = argp->access; - nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); - if (nfserr) - return nfserr; - nfserr = fh_getattr(&resp->fh, &resp->stat); - return nfserr; + resp->status = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); + if (resp->status != nfs_ok) + goto out; + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; } /* * XDR decode functions */ +static int nfsaclsvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p) +{ + return 1; +} + static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_getaclargs *argp = rqstp->rq_argp; @@ -268,6 +275,10 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) int n; int w; + *p++ = resp->status; + if (resp->status != nfs_ok) + return xdr_ressize_check(rqstp, p); + /* * Since this is version 2, the check for nfserr in * nfsd_dispatch actually ensures the following cannot happen. @@ -307,7 +318,12 @@ static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd_attrstat *resp = rqstp->rq_resp; + *p++ = resp->status; + if (resp->status != nfs_ok) + goto out; + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); +out: return xdr_ressize_check(rqstp, p); } @@ -316,8 +332,13 @@ static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_accessres *resp = rqstp->rq_resp; + *p++ = resp->status; + if (resp->status != nfs_ok) + goto out; + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); *p++ = htonl(resp->access); +out: return xdr_ressize_check(rqstp, p); } @@ -347,36 +368,63 @@ static void nfsaclsvc_release_access(struct svc_rqst *rqstp) fh_put(&resp->fh); } -#define nfsaclsvc_decode_voidargs NULL -#define nfsaclsvc_release_void NULL -#define nfsd3_fhandleargs nfsd_fhandle -#define nfsd3_attrstatres nfsd_attrstat -#define nfsd3_voidres nfsd3_voidargs struct nfsd3_voidargs { int dummy; }; -#define PROC(name, argt, rest, relt, cache, respsize) \ -{ \ - .pc_func = nfsacld_proc_##name, \ - .pc_decode = nfsaclsvc_decode_##argt##args, \ - .pc_encode = nfsaclsvc_encode_##rest##res, \ - .pc_release = nfsaclsvc_release_##relt, \ - .pc_argsize = sizeof(struct nfsd3_##argt##args), \ - .pc_ressize = sizeof(struct nfsd3_##rest##res), \ - .pc_cachetype = cache, \ - .pc_xdrressize = respsize, \ -} - #define ST 1 /* status*/ #define AT 21 /* attributes */ #define pAT (1+AT) /* post attributes - conditional */ #define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ -static const struct svc_procedure nfsd_acl_procedures2[] = { - PROC(null, void, void, void, RC_NOCACHE, ST), - PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), - PROC(setacl, setacl, attrstat, attrstat, RC_NOCACHE, ST+AT), - PROC(getattr, fhandle, attrstat, attrstat, RC_NOCACHE, ST+AT), - PROC(access, access, access, access, RC_NOCACHE, ST+AT+1), +static const struct svc_procedure nfsd_acl_procedures2[5] = { + [ACLPROC2_NULL] = { + .pc_func = nfsacld_proc_null, + .pc_decode = nfsaclsvc_decode_voidarg, + .pc_encode = nfsaclsvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd3_voidargs), + .pc_ressize = sizeof(struct nfsd3_voidargs), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [ACLPROC2_GETACL] = { + .pc_func = nfsacld_proc_getacl, + .pc_decode = nfsaclsvc_decode_getaclargs, + .pc_encode = nfsaclsvc_encode_getaclres, + .pc_release = nfsaclsvc_release_getacl, + .pc_argsize = sizeof(struct nfsd3_getaclargs), + .pc_ressize = sizeof(struct nfsd3_getaclres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+1+2*(1+ACL), + }, + [ACLPROC2_SETACL] = { + .pc_func = nfsacld_proc_setacl, + .pc_decode = nfsaclsvc_decode_setaclargs, + .pc_encode = nfsaclsvc_encode_attrstatres, + .pc_release = nfsaclsvc_release_attrstat, + .pc_argsize = sizeof(struct nfsd3_setaclargs), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + }, + [ACLPROC2_GETATTR] = { + .pc_func = nfsacld_proc_getattr, + .pc_decode = nfsaclsvc_decode_fhandleargs, + .pc_encode = nfsaclsvc_encode_attrstatres, + .pc_release = nfsaclsvc_release_attrstat, + .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + }, + [ACLPROC2_ACCESS] = { + .pc_func = nfsacld_proc_access, + .pc_decode = nfsaclsvc_decode_accessargs, + .pc_encode = nfsaclsvc_encode_accessres, + .pc_release = nfsaclsvc_release_access, + .pc_argsize = sizeof(struct nfsd3_accessargs), + .pc_ressize = sizeof(struct nfsd3_accessres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT+1, + }, }; static unsigned int nfsd_acl_count2[ARRAY_SIZE(nfsd_acl_procedures2)]; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 13bca4a2f89d..34a394e50e1d 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -13,15 +13,13 @@ #include "xdr3.h" #include "vfs.h" -#define RETURN_STATUS(st) { resp->status = (st); return (st); } - /* * NULL call. */ static __be32 nfsd3_proc_null(struct svc_rqst *rqstp) { - return nfs_ok; + return rpc_success; } /* @@ -34,17 +32,18 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp) struct posix_acl *acl; struct inode *inode; svc_fh *fh; - __be32 nfserr = 0; fh = fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); - if (nfserr) - RETURN_STATUS(nfserr); + resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (resp->status != nfs_ok) + goto out; inode = d_inode(fh->fh_dentry); - if (argp->mask & ~NFS_ACL_MASK) - RETURN_STATUS(nfserr_inval); + if (argp->mask & ~NFS_ACL_MASK) { + resp->status = nfserr_inval; + goto out; + } resp->mask = argp->mask; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { @@ -54,7 +53,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp) acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); } if (IS_ERR(acl)) { - nfserr = nfserrno(PTR_ERR(acl)); + resp->status = nfserrno(PTR_ERR(acl)); goto fail; } resp->acl_access = acl; @@ -64,19 +63,20 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp) of a non-directory! */ acl = get_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { - nfserr = nfserrno(PTR_ERR(acl)); + resp->status = nfserrno(PTR_ERR(acl)); goto fail; } resp->acl_default = acl; } /* resp->acl_{access,default} are released in nfs3svc_release_getacl. */ - RETURN_STATUS(0); +out: + return rpc_success; fail: posix_acl_release(resp->acl_access); posix_acl_release(resp->acl_default); - RETURN_STATUS(nfserr); + goto out; } /* @@ -88,12 +88,11 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp) struct nfsd3_attrstat *resp = rqstp->rq_resp; struct inode *inode; svc_fh *fh; - __be32 nfserr = 0; int error; fh = fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); - if (nfserr) + resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + if (resp->status != nfs_ok) goto out; inode = d_inode(fh->fh_dentry); @@ -113,13 +112,13 @@ out_drop_lock: fh_unlock(fh); fh_drop_write(fh); out_errno: - nfserr = nfserrno(error); + resp->status = nfserrno(error); out: /* argp->acl_{access,default} may have been allocated in nfs3svc_decode_setaclargs. */ posix_acl_release(argp->acl_access); posix_acl_release(argp->acl_default); - RETURN_STATUS(nfserr); + return rpc_success; } /* @@ -174,6 +173,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) struct nfsd3_getaclres *resp = rqstp->rq_resp; struct dentry *dentry = resp->fh.fh_dentry; + *p++ = resp->status; p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); if (resp->status == 0 && dentry && d_really_is_positive(dentry)) { struct inode *inode = d_inode(dentry); @@ -218,8 +218,8 @@ static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_attrstat *resp = rqstp->rq_resp; + *p++ = resp->status; p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); - return xdr_ressize_check(rqstp, p); } @@ -235,33 +235,43 @@ static void nfs3svc_release_getacl(struct svc_rqst *rqstp) posix_acl_release(resp->acl_default); } -#define nfs3svc_decode_voidargs NULL -#define nfs3svc_release_void NULL -#define nfsd3_setaclres nfsd3_attrstat -#define nfsd3_voidres nfsd3_voidargs struct nfsd3_voidargs { int dummy; }; -#define PROC(name, argt, rest, relt, cache, respsize) \ -{ \ - .pc_func = nfsd3_proc_##name, \ - .pc_decode = nfs3svc_decode_##argt##args, \ - .pc_encode = nfs3svc_encode_##rest##res, \ - .pc_release = nfs3svc_release_##relt, \ - .pc_argsize = sizeof(struct nfsd3_##argt##args), \ - .pc_ressize = sizeof(struct nfsd3_##rest##res), \ - .pc_cachetype = cache, \ - .pc_xdrressize = respsize, \ -} - #define ST 1 /* status*/ #define AT 21 /* attributes */ #define pAT (1+AT) /* post attributes - conditional */ #define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ -static const struct svc_procedure nfsd_acl_procedures3[] = { - PROC(null, void, void, void, RC_NOCACHE, ST), - PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), - PROC(setacl, setacl, setacl, fhandle, RC_NOCACHE, ST+pAT), +static const struct svc_procedure nfsd_acl_procedures3[3] = { + [ACLPROC3_NULL] = { + .pc_func = nfsd3_proc_null, + .pc_decode = nfs3svc_decode_voidarg, + .pc_encode = nfs3svc_encode_voidres, + .pc_argsize = sizeof(struct nfsd3_voidargs), + .pc_ressize = sizeof(struct nfsd3_voidargs), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [ACLPROC3_GETACL] = { + .pc_func = nfsd3_proc_getacl, + .pc_decode = nfs3svc_decode_getaclargs, + .pc_encode = nfs3svc_encode_getaclres, + .pc_release = nfs3svc_release_getacl, + .pc_argsize = sizeof(struct nfsd3_getaclargs), + .pc_ressize = sizeof(struct nfsd3_getaclres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+1+2*(1+ACL), + }, + [ACLPROC3_SETACL] = { + .pc_func = nfsd3_proc_setacl, + .pc_decode = nfs3svc_decode_setaclargs, + .pc_encode = nfs3svc_encode_setaclres, + .pc_release = nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_setaclargs), + .pc_ressize = sizeof(struct nfsd3_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT, + }, }; static unsigned int nfsd_acl_count3[ARRAY_SIZE(nfsd_acl_procedures3)]; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 288bc76b4574..14468613d150 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -15,8 +15,6 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC -#define RETURN_STATUS(st) { resp->status = (st); return (st); } - static int nfs3_ftypes[] = { 0, /* NF3NON */ S_IFREG, /* NF3REG */ @@ -34,7 +32,7 @@ static int nfs3_ftypes[] = { static __be32 nfsd3_proc_null(struct svc_rqst *rqstp) { - return nfs_ok; + return rpc_success; } /* @@ -45,20 +43,19 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_attrstat *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: GETATTR(3) %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, - NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); - if (nfserr) - RETURN_STATUS(nfserr); - - nfserr = fh_getattr(&resp->fh, &resp->stat); - - RETURN_STATUS(nfserr); + resp->status = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); + if (resp->status != nfs_ok) + goto out; + + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; } /* @@ -69,15 +66,14 @@ nfsd3_proc_setattr(struct svc_rqst *rqstp) { struct nfsd3_sattrargs *argp = rqstp->rq_argp; struct nfsd3_attrstat *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: SETATTR(3) %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs, - argp->check_guard, argp->guardtime); - RETURN_STATUS(nfserr); + resp->status = nfsd_setattr(rqstp, &resp->fh, &argp->attrs, + argp->check_guard, argp->guardtime); + return rpc_success; } /* @@ -88,7 +84,6 @@ nfsd3_proc_lookup(struct svc_rqst *rqstp) { struct nfsd3_diropargs *argp = rqstp->rq_argp; struct nfsd3_diropres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: LOOKUP(3) %s %.*s\n", SVCFH_fmt(&argp->fh), @@ -98,11 +93,10 @@ nfsd3_proc_lookup(struct svc_rqst *rqstp) fh_copy(&resp->dirfh, &argp->fh); fh_init(&resp->fh, NFS3_FHSIZE); - nfserr = nfsd_lookup(rqstp, &resp->dirfh, - argp->name, - argp->len, - &resp->fh); - RETURN_STATUS(nfserr); + resp->status = nfsd_lookup(rqstp, &resp->dirfh, + argp->name, argp->len, + &resp->fh); + return rpc_success; } /* @@ -113,7 +107,6 @@ nfsd3_proc_access(struct svc_rqst *rqstp) { struct nfsd3_accessargs *argp = rqstp->rq_argp; struct nfsd3_accessres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: ACCESS(3) %s 0x%x\n", SVCFH_fmt(&argp->fh), @@ -121,8 +114,8 @@ nfsd3_proc_access(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->access = argp->access; - nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); - RETURN_STATUS(nfserr); + resp->status = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); + return rpc_success; } /* @@ -133,15 +126,14 @@ nfsd3_proc_readlink(struct svc_rqst *rqstp) { struct nfsd3_readlinkargs *argp = rqstp->rq_argp; struct nfsd3_readlinkres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh)); /* Read the symlink. */ fh_copy(&resp->fh, &argp->fh); resp->len = NFS3_MAXPATHLEN; - nfserr = nfsd_readlink(rqstp, &resp->fh, argp->buffer, &resp->len); - RETURN_STATUS(nfserr); + resp->status = nfsd_readlink(rqstp, &resp->fh, argp->buffer, &resp->len); + return rpc_success; } /* @@ -152,7 +144,6 @@ nfsd3_proc_read(struct svc_rqst *rqstp) { struct nfsd3_readargs *argp = rqstp->rq_argp; struct nfsd3_readres *resp = rqstp->rq_resp; - __be32 nfserr; u32 max_blocksize = svc_max_payload(rqstp); unsigned long cnt = min(argp->count, max_blocksize); @@ -169,12 +160,10 @@ nfsd3_proc_read(struct svc_rqst *rqstp) svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); fh_copy(&resp->fh, &argp->fh); - nfserr = nfsd_read(rqstp, &resp->fh, - argp->offset, - rqstp->rq_vec, argp->vlen, - &resp->count, - &resp->eof); - RETURN_STATUS(nfserr); + resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, + rqstp->rq_vec, argp->vlen, &resp->count, + &resp->eof); + return rpc_success; } /* @@ -185,7 +174,6 @@ nfsd3_proc_write(struct svc_rqst *rqstp) { struct nfsd3_writeargs *argp = rqstp->rq_argp; struct nfsd3_writeres *resp = rqstp->rq_resp; - __be32 nfserr; unsigned long cnt = argp->len; unsigned int nvecs; @@ -199,13 +187,16 @@ nfsd3_proc_write(struct svc_rqst *rqstp) resp->committed = argp->stable; nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages, &argp->first, cnt); - if (!nvecs) - RETURN_STATUS(nfserr_io); - nfserr = nfsd_write(rqstp, &resp->fh, argp->offset, - rqstp->rq_vec, nvecs, &cnt, - resp->committed, resp->verf); + if (!nvecs) { + resp->status = nfserr_io; + goto out; + } + resp->status = nfsd_write(rqstp, &resp->fh, argp->offset, + rqstp->rq_vec, nvecs, &cnt, + resp->committed, resp->verf); resp->count = cnt; - RETURN_STATUS(nfserr); +out: + return rpc_success; } /* @@ -220,7 +211,6 @@ nfsd3_proc_create(struct svc_rqst *rqstp) struct nfsd3_diropres *resp = rqstp->rq_resp; svc_fh *dirfhp, *newfhp = NULL; struct iattr *attr; - __be32 nfserr; dprintk("nfsd: CREATE(3) %s %.*s\n", SVCFH_fmt(&argp->fh), @@ -241,11 +231,10 @@ nfsd3_proc_create(struct svc_rqst *rqstp) } /* Now create the file and set attributes */ - nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len, - attr, newfhp, - argp->createmode, (u32 *)argp->verf, NULL, NULL); - - RETURN_STATUS(nfserr); + resp->status = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len, + attr, newfhp, argp->createmode, + (u32 *)argp->verf, NULL, NULL); + return rpc_success; } /* @@ -256,7 +245,6 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp) { struct nfsd3_createargs *argp = rqstp->rq_argp; struct nfsd3_diropres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: MKDIR(3) %s %.*s\n", SVCFH_fmt(&argp->fh), @@ -266,10 +254,10 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp) argp->attrs.ia_valid &= ~ATTR_SIZE; fh_copy(&resp->dirfh, &argp->fh); fh_init(&resp->fh, NFS3_FHSIZE); - nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, - &argp->attrs, S_IFDIR, 0, &resp->fh); + resp->status = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, + &argp->attrs, S_IFDIR, 0, &resp->fh); fh_unlock(&resp->dirfh); - RETURN_STATUS(nfserr); + return rpc_success; } static __be32 @@ -277,18 +265,23 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp) { struct nfsd3_symlinkargs *argp = rqstp->rq_argp; struct nfsd3_diropres *resp = rqstp->rq_resp; - __be32 nfserr; - if (argp->tlen == 0) - RETURN_STATUS(nfserr_inval); - if (argp->tlen > NFS3_MAXPATHLEN) - RETURN_STATUS(nfserr_nametoolong); + if (argp->tlen == 0) { + resp->status = nfserr_inval; + goto out; + } + if (argp->tlen > NFS3_MAXPATHLEN) { + resp->status = nfserr_nametoolong; + goto out; + } argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first, page_address(rqstp->rq_arg.pages[0]), argp->tlen); - if (IS_ERR(argp->tname)) - RETURN_STATUS(nfserrno(PTR_ERR(argp->tname))); + if (IS_ERR(argp->tname)) { + resp->status = nfserrno(PTR_ERR(argp->tname)); + goto out; + } dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n", SVCFH_fmt(&argp->ffh), @@ -297,10 +290,11 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp) fh_copy(&resp->dirfh, &argp->ffh); fh_init(&resp->fh, NFS3_FHSIZE); - nfserr = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, argp->flen, - argp->tname, &resp->fh); + resp->status = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, + argp->flen, argp->tname, &resp->fh); kfree(argp->tname); - RETURN_STATUS(nfserr); +out: + return rpc_success; } /* @@ -311,7 +305,6 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp) { struct nfsd3_mknodargs *argp = rqstp->rq_argp; struct nfsd3_diropres *resp = rqstp->rq_resp; - __be32 nfserr; int type; dev_t rdev = 0; @@ -323,22 +316,28 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp) fh_copy(&resp->dirfh, &argp->fh); fh_init(&resp->fh, NFS3_FHSIZE); - if (argp->ftype == 0 || argp->ftype >= NF3BAD) - RETURN_STATUS(nfserr_inval); + if (argp->ftype == 0 || argp->ftype >= NF3BAD) { + resp->status = nfserr_inval; + goto out; + } if (argp->ftype == NF3CHR || argp->ftype == NF3BLK) { rdev = MKDEV(argp->major, argp->minor); if (MAJOR(rdev) != argp->major || - MINOR(rdev) != argp->minor) - RETURN_STATUS(nfserr_inval); - } else - if (argp->ftype != NF3SOCK && argp->ftype != NF3FIFO) - RETURN_STATUS(nfserr_inval); + MINOR(rdev) != argp->minor) { + resp->status = nfserr_inval; + goto out; + } + } else if (argp->ftype != NF3SOCK && argp->ftype != NF3FIFO) { + resp->status = nfserr_inval; + goto out; + } type = nfs3_ftypes[argp->ftype]; - nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, - &argp->attrs, type, rdev, &resp->fh); + resp->status = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, + &argp->attrs, type, rdev, &resp->fh); fh_unlock(&resp->dirfh); - RETURN_STATUS(nfserr); +out: + return rpc_success; } /* @@ -349,7 +348,6 @@ nfsd3_proc_remove(struct svc_rqst *rqstp) { struct nfsd3_diropargs *argp = rqstp->rq_argp; struct nfsd3_attrstat *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: REMOVE(3) %s %.*s\n", SVCFH_fmt(&argp->fh), @@ -358,9 +356,10 @@ nfsd3_proc_remove(struct svc_rqst *rqstp) /* Unlink. -S_IFDIR means file must not be a directory */ fh_copy(&resp->fh, &argp->fh); - nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len); + resp->status = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, + argp->name, argp->len); fh_unlock(&resp->fh); - RETURN_STATUS(nfserr); + return rpc_success; } /* @@ -371,7 +370,6 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp) { struct nfsd3_diropargs *argp = rqstp->rq_argp; struct nfsd3_attrstat *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: RMDIR(3) %s %.*s\n", SVCFH_fmt(&argp->fh), @@ -379,9 +377,10 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp) argp->name); fh_copy(&resp->fh, &argp->fh); - nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len); + resp->status = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, + argp->name, argp->len); fh_unlock(&resp->fh); - RETURN_STATUS(nfserr); + return rpc_success; } static __be32 @@ -389,7 +388,6 @@ nfsd3_proc_rename(struct svc_rqst *rqstp) { struct nfsd3_renameargs *argp = rqstp->rq_argp; struct nfsd3_renameres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: RENAME(3) %s %.*s ->\n", SVCFH_fmt(&argp->ffh), @@ -402,9 +400,9 @@ nfsd3_proc_rename(struct svc_rqst *rqstp) fh_copy(&resp->ffh, &argp->ffh); fh_copy(&resp->tfh, &argp->tfh); - nfserr = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen, - &resp->tfh, argp->tname, argp->tlen); - RETURN_STATUS(nfserr); + resp->status = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen, + &resp->tfh, argp->tname, argp->tlen); + return rpc_success; } static __be32 @@ -412,7 +410,6 @@ nfsd3_proc_link(struct svc_rqst *rqstp) { struct nfsd3_linkargs *argp = rqstp->rq_argp; struct nfsd3_linkres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: LINK(3) %s ->\n", SVCFH_fmt(&argp->ffh)); @@ -423,9 +420,9 @@ nfsd3_proc_link(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->ffh); fh_copy(&resp->tfh, &argp->tfh); - nfserr = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen, - &resp->fh); - RETURN_STATUS(nfserr); + resp->status = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen, + &resp->fh); + return rpc_success; } /* @@ -436,7 +433,6 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) { struct nfsd3_readdirargs *argp = rqstp->rq_argp; struct nfsd3_readdirres *resp = rqstp->rq_resp; - __be32 nfserr; int count = 0; struct page **p; caddr_t page_addr = NULL; @@ -456,8 +452,8 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) resp->common.err = nfs_ok; resp->buffer = argp->buffer; resp->rqstp = rqstp; - nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t*) &argp->cookie, - &resp->common, nfs3svc_encode_entry); + resp->status = nfsd_readdir(rqstp, &resp->fh, (loff_t *)&argp->cookie, + &resp->common, nfs3svc_encode_entry); memcpy(resp->verf, argp->verf, 8); count = 0; for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) { @@ -485,7 +481,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) resp->offset = NULL; } - RETURN_STATUS(nfserr); + return rpc_success; } /* @@ -497,7 +493,6 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) { struct nfsd3_readdirargs *argp = rqstp->rq_argp; struct nfsd3_readdirres *resp = rqstp->rq_resp; - __be32 nfserr; int count = 0; loff_t offset; struct page **p; @@ -520,17 +515,17 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) resp->rqstp = rqstp; offset = argp->cookie; - nfserr = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP); - if (nfserr) - RETURN_STATUS(nfserr); + resp->status = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP); + if (resp->status != nfs_ok) + goto out; - if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS) - RETURN_STATUS(nfserr_notsupp); + if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS) { + resp->status = nfserr_notsupp; + goto out; + } - nfserr = nfsd_readdir(rqstp, &resp->fh, - &offset, - &resp->common, - nfs3svc_encode_entry_plus); + resp->status = nfsd_readdir(rqstp, &resp->fh, &offset, + &resp->common, nfs3svc_encode_entry_plus); memcpy(resp->verf, argp->verf, 8); for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) { page_addr = page_address(*p); @@ -555,7 +550,8 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) resp->offset = NULL; } - RETURN_STATUS(nfserr); +out: + return rpc_success; } /* @@ -566,14 +562,13 @@ nfsd3_proc_fsstat(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_fsstatres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: FSSTAT(3) %s\n", SVCFH_fmt(&argp->fh)); - nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0); + resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0); fh_put(&argp->fh); - RETURN_STATUS(nfserr); + return rpc_success; } /* @@ -584,7 +579,6 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_fsinfores *resp = rqstp->rq_resp; - __be32 nfserr; u32 max_blocksize = svc_max_payload(rqstp); dprintk("nfsd: FSINFO(3) %s\n", @@ -600,13 +594,13 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp) resp->f_maxfilesize = ~(u32) 0; resp->f_properties = NFS3_FSF_DEFAULT; - nfserr = fh_verify(rqstp, &argp->fh, 0, - NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); + resp->status = fh_verify(rqstp, &argp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); /* Check special features of the file system. May request * different read/write sizes for file systems known to have * problems with large blocks */ - if (nfserr == 0) { + if (resp->status == nfs_ok) { struct super_block *sb = argp->fh.fh_dentry->d_sb; /* Note that we don't care for remote fs's here */ @@ -617,7 +611,7 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp) } fh_put(&argp->fh); - RETURN_STATUS(nfserr); + return rpc_success; } /* @@ -628,7 +622,6 @@ nfsd3_proc_pathconf(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_pathconfres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: PATHCONF(3) %s\n", SVCFH_fmt(&argp->fh)); @@ -641,9 +634,9 @@ nfsd3_proc_pathconf(struct svc_rqst *rqstp) resp->p_case_insensitive = 0; resp->p_case_preserving = 1; - nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); + resp->status = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); - if (nfserr == 0) { + if (resp->status == nfs_ok) { struct super_block *sb = argp->fh.fh_dentry->d_sb; /* Note that we don't care for remote fs's here */ @@ -660,10 +653,9 @@ nfsd3_proc_pathconf(struct svc_rqst *rqstp) } fh_put(&argp->fh); - RETURN_STATUS(nfserr); + return rpc_success; } - /* * Commit a file (range) to stable storage. */ @@ -672,21 +664,22 @@ nfsd3_proc_commit(struct svc_rqst *rqstp) { struct nfsd3_commitargs *argp = rqstp->rq_argp; struct nfsd3_commitres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: COMMIT(3) %s %u@%Lu\n", SVCFH_fmt(&argp->fh), argp->count, (unsigned long long) argp->offset); - if (argp->offset > NFS_OFFSET_MAX) - RETURN_STATUS(nfserr_inval); + if (argp->offset > NFS_OFFSET_MAX) { + resp->status = nfserr_inval; + goto out; + } fh_copy(&resp->fh, &argp->fh); - nfserr = nfsd_commit(rqstp, &resp->fh, argp->offset, argp->count, - resp->verf); - - RETURN_STATUS(nfserr); + resp->status = nfsd_commit(rqstp, &resp->fh, argp->offset, + argp->count, resp->verf); +out: + return rpc_success; } @@ -716,6 +709,7 @@ struct nfsd3_voidargs { int dummy; }; static const struct svc_procedure nfsd_procedures3[22] = { [NFS3PROC_NULL] = { .pc_func = nfsd3_proc_null, + .pc_decode = nfs3svc_decode_voidarg, .pc_encode = nfs3svc_encode_voidres, .pc_argsize = sizeof(struct nfsd3_voidargs), .pc_ressize = sizeof(struct nfsd3_voidres), diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index aae514d40b64..9c23b6acf234 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -305,6 +305,12 @@ void fill_post_wcc(struct svc_fh *fhp) * XDR decode functions */ int +nfs3svc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p) +{ + return 1; +} + +int nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p) { struct nfsd_fhandle *args = rqstp->rq_argp; @@ -635,10 +641,7 @@ nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p) /* * XDR encode functions */ -/* - * There must be an encoding function for void results so svc_process - * will work properly. - */ + int nfs3svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p) { @@ -651,6 +654,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_attrstat *resp = rqstp->rq_resp; + *p++ = resp->status; if (resp->status == 0) { lease_get_mtime(d_inode(resp->fh.fh_dentry), &resp->stat.mtime); @@ -665,6 +669,7 @@ nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_attrstat *resp = rqstp->rq_resp; + *p++ = resp->status; p = encode_wcc_data(rqstp, p, &resp->fh); return xdr_ressize_check(rqstp, p); } @@ -675,6 +680,7 @@ nfs3svc_encode_diropres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_diropres *resp = rqstp->rq_resp; + *p++ = resp->status; if (resp->status == 0) { p = encode_fh(p, &resp->fh); p = encode_post_op_attr(rqstp, p, &resp->fh); @@ -689,6 +695,7 @@ nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_accessres *resp = rqstp->rq_resp; + *p++ = resp->status; p = encode_post_op_attr(rqstp, p, &resp->fh); if (resp->status == 0) *p++ = htonl(resp->access); @@ -701,6 +708,7 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_readlinkres *resp = rqstp->rq_resp; + *p++ = resp->status; p = encode_post_op_attr(rqstp, p, &resp->fh); if (resp->status == 0) { *p++ = htonl(resp->len); @@ -723,6 +731,7 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_readres *resp = rqstp->rq_resp; + *p++ = resp->status; p = encode_post_op_attr(rqstp, p, &resp->fh); if (resp->status == 0) { *p++ = htonl(resp->count); @@ -748,6 +757,7 @@ nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_writeres *resp = rqstp->rq_resp; + *p++ = resp->status; p = encode_wcc_data(rqstp, p, &resp->fh); if (resp->status == 0) { *p++ = htonl(resp->count); @@ -764,6 +774,7 @@ nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_diropres *resp = rqstp->rq_resp; + *p++ = resp->status; if (resp->status == 0) { *p++ = xdr_one; p = encode_fh(p, &resp->fh); @@ -779,6 +790,7 @@ nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_renameres *resp = rqstp->rq_resp; + *p++ = resp->status; p = encode_wcc_data(rqstp, p, &resp->ffh); p = encode_wcc_data(rqstp, p, &resp->tfh); return xdr_ressize_check(rqstp, p); @@ -790,6 +802,7 @@ nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_linkres *resp = rqstp->rq_resp; + *p++ = resp->status; p = encode_post_op_attr(rqstp, p, &resp->fh); p = encode_wcc_data(rqstp, p, &resp->tfh); return xdr_ressize_check(rqstp, p); @@ -801,6 +814,7 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_readdirres *resp = rqstp->rq_resp; + *p++ = resp->status; p = encode_post_op_attr(rqstp, p, &resp->fh); if (resp->status == 0) { @@ -1053,6 +1067,7 @@ nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p) struct kstatfs *s = &resp->stats; u64 bs = s->f_bsize; + *p++ = resp->status; *p++ = xdr_zero; /* no post_op_attr */ if (resp->status == 0) { @@ -1073,6 +1088,7 @@ nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_fsinfores *resp = rqstp->rq_resp; + *p++ = resp->status; *p++ = xdr_zero; /* no post_op_attr */ if (resp->status == 0) { @@ -1118,6 +1134,7 @@ nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_commitres *resp = rqstp->rq_resp; + *p++ = resp->status; p = encode_wcc_data(rqstp, p, &resp->fh); /* Write verifier */ if (resp->status == 0) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index eaf50eafa935..ad2fa1a8e7ad 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -38,6 +38,7 @@ #include <linux/slab.h> #include <linux/kthread.h> #include <linux/sunrpc/addr.h> +#include <linux/nfs_ssc.h> #include "idmap.h" #include "cache.h" @@ -1247,7 +1248,7 @@ out_err: static void nfsd4_interssc_disconnect(struct vfsmount *ss_mnt) { - nfs_sb_deactive(ss_mnt->mnt_sb); + nfs_do_sb_deactive(ss_mnt->mnt_sb); mntput(ss_mnt); } @@ -2165,7 +2166,7 @@ nfsd4_removexattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, static __be32 nfsd4_proc_null(struct svc_rqst *rqstp) { - return nfs_ok; + return rpc_success; } static inline void nfsd4_increment_op_stats(u32 opnum) @@ -2457,15 +2458,14 @@ encode_op: nfsd4_increment_op_stats(op->opnum); } - cstate->status = status; fh_put(current_fh); fh_put(save_fh); BUG_ON(cstate->replay_owner); out: + cstate->status = status; /* Reset deferral mechanism for RPC deferrals */ set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); - dprintk("nfsv4 compound returned %d\n", ntohl(status)); - return status; + return rpc_success; } #define op_encode_hdr_size (2) @@ -2591,6 +2591,20 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32); } +static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 maxcount = svc_max_payload(rqstp); + u32 rlen = min(op->u.read.rd_length, maxcount); + /* + * If we detect that the file changed during hole encoding, then we + * recover by encoding the remaining reply as data. This means we need + * to set aside enough room to encode two data segments. + */ + u32 seg_len = 2 * (1 + 2 + 1); + + return (op_encode_hdr_size + 2 + seg_len + XDR_QUADLEN(rlen)) * sizeof(__be32); +} + static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { u32 maxcount = 0, rlen = 0; @@ -3163,6 +3177,13 @@ static const struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_COPY", .op_rsize_bop = nfsd4_copy_rsize, }, + [OP_READ_PLUS] = { + .op_func = nfsd4_read, + .op_release = nfsd4_read_release, + .op_name = "OP_READ_PLUS", + .op_rsize_bop = nfsd4_read_plus_rsize, + .op_get_currentstateid = nfsd4_get_readstateid, + }, [OP_SEEK] = { .op_func = nfsd4_seek, .op_name = "OP_SEEK", @@ -3231,7 +3252,7 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) if (!cstate->minorversion) return false; - if (cstate->spo_must_allowed == true) + if (cstate->spo_must_allowed) return true; opiter = resp->opcnt; @@ -3279,6 +3300,7 @@ struct nfsd4_voidargs { int dummy; }; static const struct svc_procedure nfsd_procedures4[2] = { [NFSPROC4_NULL] = { .pc_func = nfsd4_proc_null, + .pc_decode = nfs4svc_decode_voidarg, .pc_encode = nfs4svc_encode_voidres, .pc_argsize = sizeof(struct nfsd4_voidargs), .pc_ressize = sizeof(struct nfsd4_voidres), diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c09a2a4281ec..d7f27ed6b794 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4513,7 +4513,8 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, { struct nfs4_delegation *dp = cb_to_delegation(cb); - if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID) + if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID || + dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) return 1; switch (task->tk_status) { @@ -4597,7 +4598,8 @@ static bool nfsd_breaker_owns_lease(struct file_lock *fl) if (!i_am_nfsd()) return NULL; rqst = kthread_data(current); - if (!rqst->rq_lease_breaker) + /* Note rq_prog == NFS_ACL_PROGRAM is also possible: */ + if (rqst->rq_prog != NFS_PROGRAM || rqst->rq_vers < 4) return NULL; clp = *(rqst->rq_lease_breaker); return dl->dl_stid.sc_client == clp; @@ -4954,7 +4956,6 @@ static int nfsd4_check_conflicting_opens(struct nfs4_client *clp, writes--; if (fp->fi_fds[O_RDWR]) writes--; - WARN_ON_ONCE(writes < 0); if (writes > 0) return -EAGAIN; spin_lock(&fp->fi_lock); @@ -5126,7 +5127,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); - trace_nfsd_deleg_open(&dp->dl_stid.sc_stateid); + trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; nfs4_put_stid(&dp->dl_stid); return; @@ -5243,7 +5244,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf nfs4_open_delegation(current_fh, open, stp); nodeleg: status = nfs_ok; - trace_nfsd_deleg_none(&stp->st_stid.sc_stateid); + trace_nfsd_open(&stp->st_stid.sc_stateid); out: /* 4.1 client trying to upgrade/downgrade delegation? */ if (open->op_delegate_type == NFS4_OPEN_DELEGATE_NONE && dp && @@ -5722,7 +5723,6 @@ nfs4_find_file(struct nfs4_stid *s, int flags) return find_readable_file(s->sc_file); else return find_writeable_file(s->sc_file); - break; } return NULL; @@ -7253,599 +7253,6 @@ nfs4_check_open_reclaim(clientid_t *clid, return nfs_ok; } -#ifdef CONFIG_NFSD_FAULT_INJECTION -static inline void -put_client(struct nfs4_client *clp) -{ - atomic_dec(&clp->cl_rpc_users); -} - -static struct nfs4_client * -nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size) -{ - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - - if (!nfsd_netns_ready(nn)) - return NULL; - - list_for_each_entry(clp, &nn->client_lru, cl_lru) { - if (memcmp(&clp->cl_addr, addr, addr_size) == 0) - return clp; - } - return NULL; -} - -u64 -nfsd_inject_print_clients(void) -{ - struct nfs4_client *clp; - u64 count = 0; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - char buf[INET6_ADDRSTRLEN]; - - if (!nfsd_netns_ready(nn)) - return 0; - - spin_lock(&nn->client_lock); - list_for_each_entry(clp, &nn->client_lru, cl_lru) { - rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); - pr_info("NFS Client: %s\n", buf); - ++count; - } - spin_unlock(&nn->client_lock); - - return count; -} - -u64 -nfsd_inject_forget_client(struct sockaddr_storage *addr, size_t addr_size) -{ - u64 count = 0; - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - clp = nfsd_find_client(addr, addr_size); - if (clp) { - if (mark_client_expired_locked(clp) == nfs_ok) - ++count; - else - clp = NULL; - } - spin_unlock(&nn->client_lock); - - if (clp) - expire_client(clp); - - return count; -} - -u64 -nfsd_inject_forget_clients(u64 max) -{ - u64 count = 0; - struct nfs4_client *clp, *next; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { - if (mark_client_expired_locked(clp) == nfs_ok) { - list_add(&clp->cl_lru, &reaplist); - if (max != 0 && ++count >= max) - break; - } - } - spin_unlock(&nn->client_lock); - - list_for_each_entry_safe(clp, next, &reaplist, cl_lru) - expire_client(clp); - - return count; -} - -static void nfsd_print_count(struct nfs4_client *clp, unsigned int count, - const char *type) -{ - char buf[INET6_ADDRSTRLEN]; - rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); - printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type); -} - -static void -nfsd_inject_add_lock_to_list(struct nfs4_ol_stateid *lst, - struct list_head *collect) -{ - struct nfs4_client *clp = lst->st_stid.sc_client; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - - if (!collect) - return; - - lockdep_assert_held(&nn->client_lock); - atomic_inc(&clp->cl_rpc_users); - list_add(&lst->st_locks, collect); -} - -static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, - struct list_head *collect, - bool (*func)(struct nfs4_ol_stateid *)) -{ - struct nfs4_openowner *oop; - struct nfs4_ol_stateid *stp, *st_next; - struct nfs4_ol_stateid *lst, *lst_next; - u64 count = 0; - - spin_lock(&clp->cl_lock); - list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) { - list_for_each_entry_safe(stp, st_next, - &oop->oo_owner.so_stateids, st_perstateowner) { - list_for_each_entry_safe(lst, lst_next, - &stp->st_locks, st_locks) { - if (func) { - if (func(lst)) - nfsd_inject_add_lock_to_list(lst, - collect); - } - ++count; - /* - * Despite the fact that these functions deal - * with 64-bit integers for "count", we must - * ensure that it doesn't blow up the - * clp->cl_rpc_users. Throw a warning if we - * start to approach INT_MAX here. - */ - WARN_ON_ONCE(count == (INT_MAX / 2)); - if (count == max) - goto out; - } - } - } -out: - spin_unlock(&clp->cl_lock); - - return count; -} - -static u64 -nfsd_collect_client_locks(struct nfs4_client *clp, struct list_head *collect, - u64 max) -{ - return nfsd_foreach_client_lock(clp, max, collect, unhash_lock_stateid); -} - -static u64 -nfsd_print_client_locks(struct nfs4_client *clp) -{ - u64 count = nfsd_foreach_client_lock(clp, 0, NULL, NULL); - nfsd_print_count(clp, count, "locked files"); - return count; -} - -u64 -nfsd_inject_print_locks(void) -{ - struct nfs4_client *clp; - u64 count = 0; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - - if (!nfsd_netns_ready(nn)) - return 0; - - spin_lock(&nn->client_lock); - list_for_each_entry(clp, &nn->client_lru, cl_lru) - count += nfsd_print_client_locks(clp); - spin_unlock(&nn->client_lock); - - return count; -} - -static void -nfsd_reap_locks(struct list_head *reaplist) -{ - struct nfs4_client *clp; - struct nfs4_ol_stateid *stp, *next; - - list_for_each_entry_safe(stp, next, reaplist, st_locks) { - list_del_init(&stp->st_locks); - clp = stp->st_stid.sc_client; - nfs4_put_stid(&stp->st_stid); - put_client(clp); - } -} - -u64 -nfsd_inject_forget_client_locks(struct sockaddr_storage *addr, size_t addr_size) -{ - unsigned int count = 0; - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - clp = nfsd_find_client(addr, addr_size); - if (clp) - count = nfsd_collect_client_locks(clp, &reaplist, 0); - spin_unlock(&nn->client_lock); - nfsd_reap_locks(&reaplist); - return count; -} - -u64 -nfsd_inject_forget_locks(u64 max) -{ - u64 count = 0; - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - list_for_each_entry(clp, &nn->client_lru, cl_lru) { - count += nfsd_collect_client_locks(clp, &reaplist, max - count); - if (max != 0 && count >= max) - break; - } - spin_unlock(&nn->client_lock); - nfsd_reap_locks(&reaplist); - return count; -} - -static u64 -nfsd_foreach_client_openowner(struct nfs4_client *clp, u64 max, - struct list_head *collect, - void (*func)(struct nfs4_openowner *)) -{ - struct nfs4_openowner *oop, *next; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - u64 count = 0; - - lockdep_assert_held(&nn->client_lock); - - spin_lock(&clp->cl_lock); - list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) { - if (func) { - func(oop); - if (collect) { - atomic_inc(&clp->cl_rpc_users); - list_add(&oop->oo_perclient, collect); - } - } - ++count; - /* - * Despite the fact that these functions deal with - * 64-bit integers for "count", we must ensure that - * it doesn't blow up the clp->cl_rpc_users. Throw a - * warning if we start to approach INT_MAX here. - */ - WARN_ON_ONCE(count == (INT_MAX / 2)); - if (count == max) - break; - } - spin_unlock(&clp->cl_lock); - - return count; -} - -static u64 -nfsd_print_client_openowners(struct nfs4_client *clp) -{ - u64 count = nfsd_foreach_client_openowner(clp, 0, NULL, NULL); - - nfsd_print_count(clp, count, "openowners"); - return count; -} - -static u64 -nfsd_collect_client_openowners(struct nfs4_client *clp, - struct list_head *collect, u64 max) -{ - return nfsd_foreach_client_openowner(clp, max, collect, - unhash_openowner_locked); -} - -u64 -nfsd_inject_print_openowners(void) -{ - struct nfs4_client *clp; - u64 count = 0; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - - if (!nfsd_netns_ready(nn)) - return 0; - - spin_lock(&nn->client_lock); - list_for_each_entry(clp, &nn->client_lru, cl_lru) - count += nfsd_print_client_openowners(clp); - spin_unlock(&nn->client_lock); - - return count; -} - -static void -nfsd_reap_openowners(struct list_head *reaplist) -{ - struct nfs4_client *clp; - struct nfs4_openowner *oop, *next; - - list_for_each_entry_safe(oop, next, reaplist, oo_perclient) { - list_del_init(&oop->oo_perclient); - clp = oop->oo_owner.so_client; - release_openowner(oop); - put_client(clp); - } -} - -u64 -nfsd_inject_forget_client_openowners(struct sockaddr_storage *addr, - size_t addr_size) -{ - unsigned int count = 0; - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - clp = nfsd_find_client(addr, addr_size); - if (clp) - count = nfsd_collect_client_openowners(clp, &reaplist, 0); - spin_unlock(&nn->client_lock); - nfsd_reap_openowners(&reaplist); - return count; -} - -u64 -nfsd_inject_forget_openowners(u64 max) -{ - u64 count = 0; - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - list_for_each_entry(clp, &nn->client_lru, cl_lru) { - count += nfsd_collect_client_openowners(clp, &reaplist, - max - count); - if (max != 0 && count >= max) - break; - } - spin_unlock(&nn->client_lock); - nfsd_reap_openowners(&reaplist); - return count; -} - -static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max, - struct list_head *victims) -{ - struct nfs4_delegation *dp, *next; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - u64 count = 0; - - lockdep_assert_held(&nn->client_lock); - - spin_lock(&state_lock); - list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) { - if (victims) { - /* - * It's not safe to mess with delegations that have a - * non-zero dl_time. They might have already been broken - * and could be processed by the laundromat outside of - * the state_lock. Just leave them be. - */ - if (dp->dl_time != 0) - continue; - - atomic_inc(&clp->cl_rpc_users); - WARN_ON(!unhash_delegation_locked(dp)); - list_add(&dp->dl_recall_lru, victims); - } - ++count; - /* - * Despite the fact that these functions deal with - * 64-bit integers for "count", we must ensure that - * it doesn't blow up the clp->cl_rpc_users. Throw a - * warning if we start to approach INT_MAX here. - */ - WARN_ON_ONCE(count == (INT_MAX / 2)); - if (count == max) - break; - } - spin_unlock(&state_lock); - return count; -} - -static u64 -nfsd_print_client_delegations(struct nfs4_client *clp) -{ - u64 count = nfsd_find_all_delegations(clp, 0, NULL); - - nfsd_print_count(clp, count, "delegations"); - return count; -} - -u64 -nfsd_inject_print_delegations(void) -{ - struct nfs4_client *clp; - u64 count = 0; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - - if (!nfsd_netns_ready(nn)) - return 0; - - spin_lock(&nn->client_lock); - list_for_each_entry(clp, &nn->client_lru, cl_lru) - count += nfsd_print_client_delegations(clp); - spin_unlock(&nn->client_lock); - - return count; -} - -static void -nfsd_forget_delegations(struct list_head *reaplist) -{ - struct nfs4_client *clp; - struct nfs4_delegation *dp, *next; - - list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) { - list_del_init(&dp->dl_recall_lru); - clp = dp->dl_stid.sc_client; - revoke_delegation(dp); - put_client(clp); - } -} - -u64 -nfsd_inject_forget_client_delegations(struct sockaddr_storage *addr, - size_t addr_size) -{ - u64 count = 0; - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - clp = nfsd_find_client(addr, addr_size); - if (clp) - count = nfsd_find_all_delegations(clp, 0, &reaplist); - spin_unlock(&nn->client_lock); - - nfsd_forget_delegations(&reaplist); - return count; -} - -u64 -nfsd_inject_forget_delegations(u64 max) -{ - u64 count = 0; - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - list_for_each_entry(clp, &nn->client_lru, cl_lru) { - count += nfsd_find_all_delegations(clp, max - count, &reaplist); - if (max != 0 && count >= max) - break; - } - spin_unlock(&nn->client_lock); - nfsd_forget_delegations(&reaplist); - return count; -} - -static void -nfsd_recall_delegations(struct list_head *reaplist) -{ - struct nfs4_client *clp; - struct nfs4_delegation *dp, *next; - - list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) { - list_del_init(&dp->dl_recall_lru); - clp = dp->dl_stid.sc_client; - - trace_nfsd_deleg_recall(&dp->dl_stid.sc_stateid); - - /* - * We skipped all entries that had a zero dl_time before, - * so we can now reset the dl_time back to 0. If a delegation - * break comes in now, then it won't make any difference since - * we're recalling it either way. - */ - spin_lock(&state_lock); - dp->dl_time = 0; - spin_unlock(&state_lock); - nfsd_break_one_deleg(dp); - put_client(clp); - } -} - -u64 -nfsd_inject_recall_client_delegations(struct sockaddr_storage *addr, - size_t addr_size) -{ - u64 count = 0; - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - clp = nfsd_find_client(addr, addr_size); - if (clp) - count = nfsd_find_all_delegations(clp, 0, &reaplist); - spin_unlock(&nn->client_lock); - - nfsd_recall_delegations(&reaplist); - return count; -} - -u64 -nfsd_inject_recall_delegations(u64 max) -{ - u64 count = 0; - struct nfs4_client *clp, *next; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { - count += nfsd_find_all_delegations(clp, max - count, &reaplist); - if (max != 0 && ++count >= max) - break; - } - spin_unlock(&nn->client_lock); - nfsd_recall_delegations(&reaplist); - return count; -} -#endif /* CONFIG_NFSD_FAULT_INJECTION */ - /* * Since the lifetime of a delegation isn't limited to that of an open, a * client may quite reasonably hang on to a delegation as long as it has diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 259d5ad0e3f4..833a2c64dfe8 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1855,7 +1855,7 @@ static __be32 nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp, struct nfsd4_copy_notify *cn) { - int status; + __be32 status; status = nfsd4_decode_stateid(argp, &cn->cpn_src_stateid); if (status) @@ -2173,7 +2173,7 @@ static const nfsd4_dec nfsd4_dec_ops[] = { [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_offload_status, [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status, - [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_read, [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek, [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone, @@ -2261,7 +2261,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) */ cachethis |= nfsd4_cache_this_op(op); - if (op->opnum == OP_READ) { + if (op->opnum == OP_READ || op->opnum == OP_READ_PLUS) { readcount++; readbytes += nfsd4_max_reply(argp->rqstp, op); } else @@ -3814,36 +3814,14 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, { struct xdr_stream *xdr = &resp->xdr; u32 eof; - int v; int starting_len = xdr->buf->len - 8; - long len; - int thislen; __be32 nfserr; __be32 tmp; - __be32 *p; int pad; - /* - * svcrdma requires every READ payload to start somewhere - * in xdr->pages. - */ - if (xdr->iov == xdr->buf->head) { - xdr->iov = NULL; - xdr->end = xdr->p; - } - - len = maxcount; - v = 0; - while (len) { - thislen = min_t(long, len, PAGE_SIZE); - p = xdr_reserve_space(xdr, thislen); - WARN_ON_ONCE(!p); - resp->rqstp->rq_vec[v].iov_base = p; - resp->rqstp->rq_vec[v].iov_len = thislen; - v++; - len -= thislen; - } - read->rd_vlen = v; + read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, maxcount); + if (read->rd_vlen < 0) + return nfserr_resource; nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, &maxcount, @@ -4619,6 +4597,149 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, return nfserr_resource; p = xdr_encode_hyper(p, os->count); *p++ = cpu_to_be32(0); + return nfserr; +} + +static __be32 +nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, + struct nfsd4_read *read, + unsigned long *maxcount, u32 *eof, + loff_t *pos) +{ + struct xdr_stream *xdr = &resp->xdr; + struct file *file = read->rd_nf->nf_file; + int starting_len = xdr->buf->len; + loff_t hole_pos; + __be32 nfserr; + __be32 *p, tmp; + __be64 tmp64; + + hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE); + if (hole_pos > read->rd_offset) + *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset); + *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len)); + + /* Content type, offset, byte count */ + p = xdr_reserve_space(xdr, 4 + 8 + 4); + if (!p) + return nfserr_resource; + + read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount); + if (read->rd_vlen < 0) + return nfserr_resource; + + nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, + resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof); + if (nfserr) + return nfserr; + + tmp = htonl(NFS4_CONTENT_DATA); + write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); + tmp64 = cpu_to_be64(read->rd_offset); + write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8); + tmp = htonl(*maxcount); + write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4); + return nfs_ok; +} + +static __be32 +nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, + struct nfsd4_read *read, + unsigned long *maxcount, u32 *eof) +{ + struct file *file = read->rd_nf->nf_file; + loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA); + loff_t f_size = i_size_read(file_inode(file)); + unsigned long count; + __be32 *p; + + if (data_pos == -ENXIO) + data_pos = f_size; + else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE)) + return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size); + count = data_pos - read->rd_offset; + + /* Content type, offset, byte count */ + p = xdr_reserve_space(&resp->xdr, 4 + 8 + 8); + if (!p) + return nfserr_resource; + + *p++ = htonl(NFS4_CONTENT_HOLE); + p = xdr_encode_hyper(p, read->rd_offset); + p = xdr_encode_hyper(p, count); + + *eof = (read->rd_offset + count) >= f_size; + *maxcount = min_t(unsigned long, count, *maxcount); + return nfs_ok; +} + +static __be32 +nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_read *read) +{ + unsigned long maxcount, count; + struct xdr_stream *xdr = &resp->xdr; + struct file *file; + int starting_len = xdr->buf->len; + int last_segment = xdr->buf->len; + int segments = 0; + __be32 *p, tmp; + bool is_data; + loff_t pos; + u32 eof; + + if (nfserr) + return nfserr; + file = read->rd_nf->nf_file; + + /* eof flag, segment count */ + p = xdr_reserve_space(xdr, 4 + 4); + if (!p) + return nfserr_resource; + xdr_commit_encode(xdr); + + maxcount = svc_max_payload(resp->rqstp); + maxcount = min_t(unsigned long, maxcount, + (xdr->buf->buflen - xdr->buf->len)); + maxcount = min_t(unsigned long, maxcount, read->rd_length); + count = maxcount; + + eof = read->rd_offset >= i_size_read(file_inode(file)); + if (eof) + goto out; + + pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE); + is_data = pos > read->rd_offset; + + while (count > 0 && !eof) { + maxcount = count; + if (is_data) + nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof, + segments == 0 ? &pos : NULL); + else + nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof); + if (nfserr) + goto out; + count -= maxcount; + read->rd_offset += maxcount; + is_data = !is_data; + last_segment = xdr->buf->len; + segments++; + } + +out: + if (nfserr && segments == 0) + xdr_truncate_encode(xdr, starting_len); + else { + tmp = htonl(eof); + write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); + tmp = htonl(segments); + write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); + if (nfserr) { + xdr_truncate_encode(xdr, last_segment); + nfserr = nfs_ok; + } + } return nfserr; } @@ -4679,7 +4800,7 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) /* * Encode kmalloc-ed buffer in to XDR stream. */ -static int +static __be32 nfsd4_vbuf_to_stream(struct xdr_stream *xdr, char *buf, u32 buflen) { u32 cplen; @@ -4795,7 +4916,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, u32 xdrlen, offset; u64 cookie; char *sp; - __be32 status; + __be32 status, tmp; __be32 *p; u32 nuser; @@ -4828,7 +4949,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, slen = strlen(sp); /* - * Check if this a user. attribute, skip it if not. + * Check if this is a "user." attribute, skip it if not. */ if (strncmp(sp, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) goto contloop; @@ -4859,7 +4980,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, goto out; } - p = xdr_encode_opaque(p, sp, slen); + xdr_encode_opaque(p, sp, slen); xdrleft -= xdrlen; count++; @@ -4888,8 +5009,8 @@ wreof: cookie = offset + count; write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &cookie, 8); - count = htonl(count); - write_bytes_to_xdr_buf(xdr->buf, count_offset, &count, 4); + tmp = cpu_to_be32(count); + write_bytes_to_xdr_buf(xdr->buf, count_offset, &tmp, 4); out: if (listxattrs->lsxa_len) kvfree(listxattrs->lsxa_buf); @@ -4996,7 +5117,7 @@ static const nfsd4_enc nfsd4_enc_ops[] = { [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop, [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop, [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_offload_status, - [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop, + [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_read_plus, [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek, [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop, [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop, @@ -5157,6 +5278,12 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp) } int +nfs4svc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p) +{ + return 1; +} + +int nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd4_compoundargs *args = rqstp->rq_argp; @@ -5183,15 +5310,14 @@ nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p) int nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p) { - /* - * All that remains is to write the tag and operation count... - */ struct nfsd4_compoundres *resp = rqstp->rq_resp; struct xdr_buf *buf = resp->xdr.buf; WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len + buf->tail[0].iov_len); + *p = resp->cstate.status; + rqstp->rq_next_page = resp->xdr.page_ptr + 1; p = resp->tagp; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 0a0cf1fd77d3..80c90fc231a5 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -172,14 +172,10 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) if (status) goto out_nomem; - nn->drc_hashtbl = kcalloc(hashsize, - sizeof(*nn->drc_hashtbl), GFP_KERNEL); - if (!nn->drc_hashtbl) { - nn->drc_hashtbl = vzalloc(array_size(hashsize, - sizeof(*nn->drc_hashtbl))); - if (!nn->drc_hashtbl) - goto out_shrinker; - } + nn->drc_hashtbl = kvzalloc(array_size(hashsize, + sizeof(*nn->drc_hashtbl)), GFP_KERNEL); + if (!nn->drc_hashtbl) + goto out_shrinker; for (i = 0; i < hashsize; i++) { INIT_LIST_HEAD(&nn->drc_hashtbl[i].lru_head); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 7ae236113040..f6d5d783f4a4 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1534,7 +1534,6 @@ static int __init init_nfsd(void) retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; - nfsd_fault_inject_init(); /* nfsd fault injection controls */ nfsd_stat_init(); /* Statistics */ retval = nfsd_drc_slab_create(); if (retval) @@ -1555,7 +1554,6 @@ out_free_lockd: nfsd_drc_slab_free(); out_free_stat: nfsd_stat_shutdown(); - nfsd_fault_inject_cleanup(); nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); @@ -1575,7 +1573,6 @@ static void __exit exit_nfsd(void) nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); - nfsd_fault_inject_cleanup(); unregister_filesystem(&nfsd_fs_type); unregister_cld_notifier(); unregister_pernet_subsys(&nfsd_net_ops); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 6e0b066480c5..0d71549f9d42 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -11,30 +11,14 @@ #include "xdr.h" #include "vfs.h" -typedef struct svc_rqst svc_rqst; -typedef struct svc_buf svc_buf; - #define NFSDDBG_FACILITY NFSDDBG_PROC - static __be32 nfsd_proc_null(struct svc_rqst *rqstp) { - return nfs_ok; + return rpc_success; } -static __be32 -nfsd_return_attrs(__be32 err, struct nfsd_attrstat *resp) -{ - if (err) return err; - return fh_getattr(&resp->fh, &resp->stat); -} -static __be32 -nfsd_return_dirop(__be32 err, struct nfsd_diropres *resp) -{ - if (err) return err; - return fh_getattr(&resp->fh, &resp->stat); -} /* * Get a file's attributes * N.B. After this call resp->fh needs an fh_put @@ -44,13 +28,17 @@ nfsd_proc_getattr(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_attrstat *resp = rqstp->rq_resp; - __be32 nfserr; + dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, - NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); - return nfsd_return_attrs(nfserr, resp); + resp->status = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); + if (resp->status != nfs_ok) + goto out; + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; } /* @@ -64,7 +52,6 @@ nfsd_proc_setattr(struct svc_rqst *rqstp) struct nfsd_attrstat *resp = rqstp->rq_resp; struct iattr *iap = &argp->attrs; struct svc_fh *fhp; - __be32 nfserr; dprintk("nfsd: SETATTR %s, valid=%x, size=%ld\n", SVCFH_fmt(&argp->fh), @@ -96,9 +83,9 @@ nfsd_proc_setattr(struct svc_rqst *rqstp) */ time64_t delta = iap->ia_atime.tv_sec - ktime_get_real_seconds(); - nfserr = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); - if (nfserr) - goto done; + resp->status = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); + if (resp->status != nfs_ok) + goto out; if (delta < 0) delta = -delta; @@ -113,9 +100,20 @@ nfsd_proc_setattr(struct svc_rqst *rqstp) } } - nfserr = nfsd_setattr(rqstp, fhp, iap, 0, (time64_t)0); -done: - return nfsd_return_attrs(nfserr, resp); + resp->status = nfsd_setattr(rqstp, fhp, iap, 0, (time64_t)0); + if (resp->status != nfs_ok) + goto out; + + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; +} + +/* Obsolete, replaced by MNTPROC_MNT. */ +static __be32 +nfsd_proc_root(struct svc_rqst *rqstp) +{ + return rpc_success; } /* @@ -129,17 +127,20 @@ nfsd_proc_lookup(struct svc_rqst *rqstp) { struct nfsd_diropargs *argp = rqstp->rq_argp; struct nfsd_diropres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: LOOKUP %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); fh_init(&resp->fh, NFS_FHSIZE); - nfserr = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len, - &resp->fh); - + resp->status = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len, + &resp->fh); fh_put(&argp->fh); - return nfsd_return_dirop(nfserr, resp); + if (resp->status != nfs_ok) + goto out; + + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; } /* @@ -150,16 +151,15 @@ nfsd_proc_readlink(struct svc_rqst *rqstp) { struct nfsd_readlinkargs *argp = rqstp->rq_argp; struct nfsd_readlinkres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh)); /* Read the symlink. */ resp->len = NFS_MAXPATHLEN; - nfserr = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len); + resp->status = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len); fh_put(&argp->fh); - return nfserr; + return rpc_success; } /* @@ -171,7 +171,6 @@ nfsd_proc_read(struct svc_rqst *rqstp) { struct nfsd_readargs *argp = rqstp->rq_argp; struct nfsd_readres *resp = rqstp->rq_resp; - __be32 nfserr; u32 eof; dprintk("nfsd: READ %s %d bytes at %d\n", @@ -193,14 +192,23 @@ nfsd_proc_read(struct svc_rqst *rqstp) svc_reserve_auth(rqstp, (19<<2) + argp->count + 4); resp->count = argp->count; - nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), - argp->offset, - rqstp->rq_vec, argp->vlen, - &resp->count, - &eof); - - if (nfserr) return nfserr; - return fh_getattr(&resp->fh, &resp->stat); + resp->status = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), + argp->offset, + rqstp->rq_vec, argp->vlen, + &resp->count, + &eof); + if (resp->status == nfs_ok) + resp->status = fh_getattr(&resp->fh, &resp->stat); + else if (resp->status == nfserr_jukebox) + return rpc_drop_reply; + return rpc_success; +} + +/* Reserved */ +static __be32 +nfsd_proc_writecache(struct svc_rqst *rqstp) +{ + return rpc_success; } /* @@ -212,7 +220,6 @@ nfsd_proc_write(struct svc_rqst *rqstp) { struct nfsd_writeargs *argp = rqstp->rq_argp; struct nfsd_attrstat *resp = rqstp->rq_resp; - __be32 nfserr; unsigned long cnt = argp->len; unsigned int nvecs; @@ -222,12 +229,20 @@ nfsd_proc_write(struct svc_rqst *rqstp) nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages, &argp->first, cnt); - if (!nvecs) - return nfserr_io; - nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), - argp->offset, rqstp->rq_vec, nvecs, - &cnt, NFS_DATA_SYNC, NULL); - return nfsd_return_attrs(nfserr, resp); + if (!nvecs) { + resp->status = nfserr_io; + goto out; + } + + resp->status = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), + argp->offset, rqstp->rq_vec, nvecs, + &cnt, NFS_DATA_SYNC, NULL); + if (resp->status == nfs_ok) + resp->status = fh_getattr(&resp->fh, &resp->stat); + else if (resp->status == nfserr_jukebox) + return rpc_drop_reply; +out: + return rpc_success; } /* @@ -247,7 +262,6 @@ nfsd_proc_create(struct svc_rqst *rqstp) struct inode *inode; struct dentry *dchild; int type, mode; - __be32 nfserr; int hosterr; dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size); @@ -255,40 +269,40 @@ nfsd_proc_create(struct svc_rqst *rqstp) SVCFH_fmt(dirfhp), argp->len, argp->name); /* First verify the parent file handle */ - nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC); - if (nfserr) + resp->status = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC); + if (resp->status != nfs_ok) goto done; /* must fh_put dirfhp even on error */ /* Check for NFSD_MAY_WRITE in nfsd_create if necessary */ - nfserr = nfserr_exist; + resp->status = nfserr_exist; if (isdotent(argp->name, argp->len)) goto done; hosterr = fh_want_write(dirfhp); if (hosterr) { - nfserr = nfserrno(hosterr); + resp->status = nfserrno(hosterr); goto done; } fh_lock_nested(dirfhp, I_MUTEX_PARENT); dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len); if (IS_ERR(dchild)) { - nfserr = nfserrno(PTR_ERR(dchild)); + resp->status = nfserrno(PTR_ERR(dchild)); goto out_unlock; } fh_init(newfhp, NFS_FHSIZE); - nfserr = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp); - if (!nfserr && d_really_is_negative(dchild)) - nfserr = nfserr_noent; + resp->status = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp); + if (!resp->status && d_really_is_negative(dchild)) + resp->status = nfserr_noent; dput(dchild); - if (nfserr) { - if (nfserr != nfserr_noent) + if (resp->status) { + if (resp->status != nfserr_noent) goto out_unlock; /* * If the new file handle wasn't verified, we can't tell * whether the file exists or not. Time to bail ... */ - nfserr = nfserr_acces; + resp->status = nfserr_acces; if (!newfhp->fh_dentry) { printk(KERN_WARNING "nfsd_proc_create: file handle not verified\n"); @@ -321,11 +335,11 @@ nfsd_proc_create(struct svc_rqst *rqstp) * echo thing > device-special-file-or-pipe * by doing a CREATE with type==0 */ - nfserr = nfsd_permission(rqstp, + resp->status = nfsd_permission(rqstp, newfhp->fh_export, newfhp->fh_dentry, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS); - if (nfserr && nfserr != nfserr_rofs) + if (resp->status && resp->status != nfserr_rofs) goto out_unlock; } } else @@ -361,16 +375,17 @@ nfsd_proc_create(struct svc_rqst *rqstp) attr->ia_valid &= ~ATTR_SIZE; /* Make sure the type and device matches */ - nfserr = nfserr_exist; + resp->status = nfserr_exist; if (inode && type != (inode->i_mode & S_IFMT)) goto out_unlock; } - nfserr = 0; + resp->status = nfs_ok; if (!inode) { /* File doesn't exist. Create it and set attrs */ - nfserr = nfsd_create_locked(rqstp, dirfhp, argp->name, - argp->len, attr, type, rdev, newfhp); + resp->status = nfsd_create_locked(rqstp, dirfhp, argp->name, + argp->len, attr, type, rdev, + newfhp); } else if (type == S_IFREG) { dprintk("nfsd: existing %s, valid=%x, size=%ld\n", argp->name, attr->ia_valid, (long) attr->ia_size); @@ -380,7 +395,8 @@ nfsd_proc_create(struct svc_rqst *rqstp) */ attr->ia_valid &= ATTR_SIZE; if (attr->ia_valid) - nfserr = nfsd_setattr(rqstp, newfhp, attr, 0, (time64_t)0); + resp->status = nfsd_setattr(rqstp, newfhp, attr, 0, + (time64_t)0); } out_unlock: @@ -389,47 +405,52 @@ out_unlock: fh_drop_write(dirfhp); done: fh_put(dirfhp); - return nfsd_return_dirop(nfserr, resp); + if (resp->status != nfs_ok) + goto out; + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; } static __be32 nfsd_proc_remove(struct svc_rqst *rqstp) { struct nfsd_diropargs *argp = rqstp->rq_argp; - __be32 nfserr; + struct nfsd_stat *resp = rqstp->rq_resp; dprintk("nfsd: REMOVE %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); /* Unlink. -SIFDIR means file must not be a directory */ - nfserr = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR, argp->name, argp->len); + resp->status = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR, + argp->name, argp->len); fh_put(&argp->fh); - return nfserr; + return rpc_success; } static __be32 nfsd_proc_rename(struct svc_rqst *rqstp) { struct nfsd_renameargs *argp = rqstp->rq_argp; - __be32 nfserr; + struct nfsd_stat *resp = rqstp->rq_resp; dprintk("nfsd: RENAME %s %.*s -> \n", SVCFH_fmt(&argp->ffh), argp->flen, argp->fname); dprintk("nfsd: -> %s %.*s\n", SVCFH_fmt(&argp->tfh), argp->tlen, argp->tname); - nfserr = nfsd_rename(rqstp, &argp->ffh, argp->fname, argp->flen, - &argp->tfh, argp->tname, argp->tlen); + resp->status = nfsd_rename(rqstp, &argp->ffh, argp->fname, argp->flen, + &argp->tfh, argp->tname, argp->tlen); fh_put(&argp->ffh); fh_put(&argp->tfh); - return nfserr; + return rpc_success; } static __be32 nfsd_proc_link(struct svc_rqst *rqstp) { struct nfsd_linkargs *argp = rqstp->rq_argp; - __be32 nfserr; + struct nfsd_stat *resp = rqstp->rq_resp; dprintk("nfsd: LINK %s ->\n", SVCFH_fmt(&argp->ffh)); @@ -438,41 +459,46 @@ nfsd_proc_link(struct svc_rqst *rqstp) argp->tlen, argp->tname); - nfserr = nfsd_link(rqstp, &argp->tfh, argp->tname, argp->tlen, - &argp->ffh); + resp->status = nfsd_link(rqstp, &argp->tfh, argp->tname, argp->tlen, + &argp->ffh); fh_put(&argp->ffh); fh_put(&argp->tfh); - return nfserr; + return rpc_success; } static __be32 nfsd_proc_symlink(struct svc_rqst *rqstp) { struct nfsd_symlinkargs *argp = rqstp->rq_argp; + struct nfsd_stat *resp = rqstp->rq_resp; struct svc_fh newfh; - __be32 nfserr; - if (argp->tlen > NFS_MAXPATHLEN) - return nfserr_nametoolong; + if (argp->tlen > NFS_MAXPATHLEN) { + resp->status = nfserr_nametoolong; + goto out; + } argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first, page_address(rqstp->rq_arg.pages[0]), argp->tlen); - if (IS_ERR(argp->tname)) - return nfserrno(PTR_ERR(argp->tname)); + if (IS_ERR(argp->tname)) { + resp->status = nfserrno(PTR_ERR(argp->tname)); + goto out; + } dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n", SVCFH_fmt(&argp->ffh), argp->flen, argp->fname, argp->tlen, argp->tname); fh_init(&newfh, NFS_FHSIZE); - nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, - argp->tname, &newfh); + resp->status = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, + argp->tname, &newfh); kfree(argp->tname); fh_put(&argp->ffh); fh_put(&newfh); - return nfserr; +out: + return rpc_success; } /* @@ -484,7 +510,6 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp) { struct nfsd_createargs *argp = rqstp->rq_argp; struct nfsd_diropres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); @@ -495,10 +520,15 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp) argp->attrs.ia_valid &= ~ATTR_SIZE; fh_init(&resp->fh, NFS_FHSIZE); - nfserr = nfsd_create(rqstp, &argp->fh, argp->name, argp->len, - &argp->attrs, S_IFDIR, 0, &resp->fh); + resp->status = nfsd_create(rqstp, &argp->fh, argp->name, argp->len, + &argp->attrs, S_IFDIR, 0, &resp->fh); fh_put(&argp->fh); - return nfsd_return_dirop(nfserr, resp); + if (resp->status != nfs_ok) + goto out; + + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; } /* @@ -508,13 +538,14 @@ static __be32 nfsd_proc_rmdir(struct svc_rqst *rqstp) { struct nfsd_diropargs *argp = rqstp->rq_argp; - __be32 nfserr; + struct nfsd_stat *resp = rqstp->rq_resp; dprintk("nfsd: RMDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); - nfserr = nfsd_unlink(rqstp, &argp->fh, S_IFDIR, argp->name, argp->len); + resp->status = nfsd_unlink(rqstp, &argp->fh, S_IFDIR, + argp->name, argp->len); fh_put(&argp->fh); - return nfserr; + return rpc_success; } /* @@ -526,7 +557,6 @@ nfsd_proc_readdir(struct svc_rqst *rqstp) struct nfsd_readdirargs *argp = rqstp->rq_argp; struct nfsd_readdirres *resp = rqstp->rq_resp; int count; - __be32 nfserr; loff_t offset; dprintk("nfsd: READDIR %s %d bytes at %d\n", @@ -547,15 +577,15 @@ nfsd_proc_readdir(struct svc_rqst *rqstp) resp->common.err = nfs_ok; /* Read directory and encode entries on the fly */ offset = argp->cookie; - nfserr = nfsd_readdir(rqstp, &argp->fh, &offset, - &resp->common, nfssvc_encode_entry); + resp->status = nfsd_readdir(rqstp, &argp->fh, &offset, + &resp->common, nfssvc_encode_entry); resp->count = resp->buffer - argp->buffer; if (resp->offset) *resp->offset = htonl(offset); fh_put(&argp->fh); - return nfserr; + return rpc_success; } /* @@ -566,14 +596,13 @@ nfsd_proc_statfs(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_statfsres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh)); - nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, - NFSD_MAY_BYPASS_GSS_ON_ROOT); + resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats, + NFSD_MAY_BYPASS_GSS_ON_ROOT); fh_put(&argp->fh); - return nfserr; + return rpc_success; } /* @@ -594,13 +623,13 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_argsize = sizeof(struct nfsd_void), .pc_ressize = sizeof(struct nfsd_void), .pc_cachetype = RC_NOCACHE, - .pc_xdrressize = ST, + .pc_xdrressize = 0, }, [NFSPROC_GETATTR] = { .pc_func = nfsd_proc_getattr, .pc_decode = nfssvc_decode_fhandle, .pc_encode = nfssvc_encode_attrstat, - .pc_release = nfssvc_release_fhandle, + .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, @@ -610,25 +639,26 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_func = nfsd_proc_setattr, .pc_decode = nfssvc_decode_sattrargs, .pc_encode = nfssvc_encode_attrstat, - .pc_release = nfssvc_release_fhandle, + .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_sattrargs), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+AT, }, [NFSPROC_ROOT] = { + .pc_func = nfsd_proc_root, .pc_decode = nfssvc_decode_void, .pc_encode = nfssvc_encode_void, .pc_argsize = sizeof(struct nfsd_void), .pc_ressize = sizeof(struct nfsd_void), .pc_cachetype = RC_NOCACHE, - .pc_xdrressize = ST, + .pc_xdrressize = 0, }, [NFSPROC_LOOKUP] = { .pc_func = nfsd_proc_lookup, .pc_decode = nfssvc_decode_diropargs, .pc_encode = nfssvc_encode_diropres, - .pc_release = nfssvc_release_fhandle, + .pc_release = nfssvc_release_diropres, .pc_argsize = sizeof(struct nfsd_diropargs), .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_NOCACHE, @@ -647,25 +677,26 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_func = nfsd_proc_read, .pc_decode = nfssvc_decode_readargs, .pc_encode = nfssvc_encode_readres, - .pc_release = nfssvc_release_fhandle, + .pc_release = nfssvc_release_readres, .pc_argsize = sizeof(struct nfsd_readargs), .pc_ressize = sizeof(struct nfsd_readres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4, }, [NFSPROC_WRITECACHE] = { + .pc_func = nfsd_proc_writecache, .pc_decode = nfssvc_decode_void, .pc_encode = nfssvc_encode_void, .pc_argsize = sizeof(struct nfsd_void), .pc_ressize = sizeof(struct nfsd_void), .pc_cachetype = RC_NOCACHE, - .pc_xdrressize = ST, + .pc_xdrressize = 0, }, [NFSPROC_WRITE] = { .pc_func = nfsd_proc_write, .pc_decode = nfssvc_decode_writeargs, .pc_encode = nfssvc_encode_attrstat, - .pc_release = nfssvc_release_fhandle, + .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_writeargs), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_REPLBUFF, @@ -675,7 +706,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_func = nfsd_proc_create, .pc_decode = nfssvc_decode_createargs, .pc_encode = nfssvc_encode_diropres, - .pc_release = nfssvc_release_fhandle, + .pc_release = nfssvc_release_diropres, .pc_argsize = sizeof(struct nfsd_createargs), .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_REPLBUFF, @@ -684,36 +715,36 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_REMOVE] = { .pc_func = nfsd_proc_remove, .pc_decode = nfssvc_decode_diropargs, - .pc_encode = nfssvc_encode_void, + .pc_encode = nfssvc_encode_stat, .pc_argsize = sizeof(struct nfsd_diropargs), - .pc_ressize = sizeof(struct nfsd_void), + .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, }, [NFSPROC_RENAME] = { .pc_func = nfsd_proc_rename, .pc_decode = nfssvc_decode_renameargs, - .pc_encode = nfssvc_encode_void, + .pc_encode = nfssvc_encode_stat, .pc_argsize = sizeof(struct nfsd_renameargs), - .pc_ressize = sizeof(struct nfsd_void), + .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, }, [NFSPROC_LINK] = { .pc_func = nfsd_proc_link, .pc_decode = nfssvc_decode_linkargs, - .pc_encode = nfssvc_encode_void, + .pc_encode = nfssvc_encode_stat, .pc_argsize = sizeof(struct nfsd_linkargs), - .pc_ressize = sizeof(struct nfsd_void), + .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, }, [NFSPROC_SYMLINK] = { .pc_func = nfsd_proc_symlink, .pc_decode = nfssvc_decode_symlinkargs, - .pc_encode = nfssvc_encode_void, + .pc_encode = nfssvc_encode_stat, .pc_argsize = sizeof(struct nfsd_symlinkargs), - .pc_ressize = sizeof(struct nfsd_void), + .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, }, @@ -721,7 +752,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_func = nfsd_proc_mkdir, .pc_decode = nfssvc_decode_createargs, .pc_encode = nfssvc_encode_diropres, - .pc_release = nfssvc_release_fhandle, + .pc_release = nfssvc_release_diropres, .pc_argsize = sizeof(struct nfsd_createargs), .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_REPLBUFF, @@ -730,9 +761,9 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_RMDIR] = { .pc_func = nfsd_proc_rmdir, .pc_decode = nfssvc_decode_diropargs, - .pc_encode = nfssvc_encode_void, + .pc_encode = nfssvc_encode_stat, .pc_argsize = sizeof(struct nfsd_diropargs), - .pc_ressize = sizeof(struct nfsd_void), + .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, }, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index f7f6473578af..27b1ad136150 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -960,15 +960,6 @@ out: return 0; } -static __be32 map_new_errors(u32 vers, __be32 nfserr) -{ - if (nfserr == nfserr_jukebox && vers == 2) - return nfserr_dropit; - if (nfserr == nfserr_wrongsec && vers < 4) - return nfserr_acces; - return nfserr; -} - /* * A write procedure can have a large argument, and a read procedure can * have a large reply, but no NFSv2 or NFSv3 procedure has argument and @@ -1000,80 +991,85 @@ static bool nfs_request_too_big(struct svc_rqst *rqstp, return rqstp->rq_arg.len > PAGE_SIZE; } -int -nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) +/** + * nfsd_dispatch - Process an NFS or NFSACL Request + * @rqstp: incoming request + * @statp: pointer to location of accept_stat field in RPC Reply buffer + * + * This RPC dispatcher integrates the NFS server's duplicate reply cache. + * + * Return values: + * %0: Processing complete; do not send a Reply + * %1: Processing complete; send Reply in rqstp->rq_res + */ +int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) { - const struct svc_procedure *proc; - __be32 nfserr; - __be32 *nfserrp; + const struct svc_procedure *proc = rqstp->rq_procinfo; + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; + __be32 *p; dprintk("nfsd_dispatch: vers %d proc %d\n", rqstp->rq_vers, rqstp->rq_proc); - proc = rqstp->rq_procinfo; - if (nfs_request_too_big(rqstp, proc)) { - dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers); - *statp = rpc_garbage_args; - return 1; - } - rqstp->rq_lease_breaker = NULL; + if (nfs_request_too_big(rqstp, proc)) + goto out_too_large; + /* * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) */ rqstp->rq_cachetype = proc->pc_cachetype; - /* Decode arguments */ - if (proc->pc_decode && - !proc->pc_decode(rqstp, (__be32*)rqstp->rq_arg.head[0].iov_base)) { - dprintk("nfsd: failed to decode arguments!\n"); - *statp = rpc_garbage_args; - return 1; - } + if (!proc->pc_decode(rqstp, argv->iov_base)) + goto out_decode_err; - /* Check whether we have this call in the cache. */ switch (nfsd_cache_lookup(rqstp)) { - case RC_DROPIT: - return 0; + case RC_DOIT: + break; case RC_REPLY: - return 1; - case RC_DOIT:; - /* do it */ + goto out_cached_reply; + case RC_DROPIT: + goto out_dropit; } - /* need to grab the location to store the status, as - * nfsv4 does some encoding while processing + /* + * Need to grab the location to store the status, as + * NFSv4 does some encoding while processing */ - nfserrp = rqstp->rq_res.head[0].iov_base - + rqstp->rq_res.head[0].iov_len; - rqstp->rq_res.head[0].iov_len += sizeof(__be32); - - /* Now call the procedure handler, and encode NFS status. */ - nfserr = proc->pc_func(rqstp); - nfserr = map_new_errors(rqstp->rq_vers, nfserr); - if (nfserr == nfserr_dropit || test_bit(RQ_DROPME, &rqstp->rq_flags)) { - dprintk("nfsd: Dropping request; may be revisited later\n"); - nfsd_cache_update(rqstp, RC_NOCACHE, NULL); - return 0; - } + p = resv->iov_base + resv->iov_len; + resv->iov_len += sizeof(__be32); - if (rqstp->rq_proc != 0) - *nfserrp++ = nfserr; + *statp = proc->pc_func(rqstp); + if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags)) + goto out_update_drop; - /* Encode result. - * For NFSv2, additional info is never returned in case of an error. - */ - if (!(nfserr && rqstp->rq_vers == 2)) { - if (proc->pc_encode && !proc->pc_encode(rqstp, nfserrp)) { - /* Failed to encode result. Release cache entry */ - dprintk("nfsd: failed to encode result!\n"); - nfsd_cache_update(rqstp, RC_NOCACHE, NULL); - *statp = rpc_system_err; - return 1; - } - } + if (!proc->pc_encode(rqstp, p)) + goto out_encode_err; - /* Store reply in cache. */ nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1); +out_cached_reply: + return 1; + +out_too_large: + dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers); + *statp = rpc_garbage_args; + return 1; + +out_decode_err: + dprintk("nfsd: failed to decode arguments!\n"); + *statp = rpc_garbage_args; + return 1; + +out_update_drop: + dprintk("nfsd: Dropping request; may be revisited later\n"); + nfsd_cache_update(rqstp, RC_NOCACHE, NULL); +out_dropit: + return 0; + +out_encode_err: + dprintk("nfsd: failed to encode result!\n"); + nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + *statp = rpc_system_err; return 1; } diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index b51fe515f06f..8a288c8fcd57 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -430,11 +430,24 @@ nfssvc_encode_void(struct svc_rqst *rqstp, __be32 *p) } int +nfssvc_encode_stat(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_stat *resp = rqstp->rq_resp; + + *p++ = resp->status; + return xdr_ressize_check(rqstp, p); +} + +int nfssvc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p) { struct nfsd_attrstat *resp = rqstp->rq_resp; + *p++ = resp->status; + if (resp->status != nfs_ok) + goto out; p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); +out: return xdr_ressize_check(rqstp, p); } @@ -443,8 +456,12 @@ nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd_diropres *resp = rqstp->rq_resp; + *p++ = resp->status; + if (resp->status != nfs_ok) + goto out; p = encode_fh(p, &resp->fh); p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); +out: return xdr_ressize_check(rqstp, p); } @@ -453,6 +470,10 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd_readlinkres *resp = rqstp->rq_resp; + *p++ = resp->status; + if (resp->status != nfs_ok) + return xdr_ressize_check(rqstp, p); + *p++ = htonl(resp->len); xdr_ressize_check(rqstp, p); rqstp->rq_res.page_len = resp->len; @@ -470,6 +491,10 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd_readres *resp = rqstp->rq_resp; + *p++ = resp->status; + if (resp->status != nfs_ok) + return xdr_ressize_check(rqstp, p); + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); *p++ = htonl(resp->count); xdr_ressize_check(rqstp, p); @@ -490,6 +515,10 @@ nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd_readdirres *resp = rqstp->rq_resp; + *p++ = resp->status; + if (resp->status != nfs_ok) + return xdr_ressize_check(rqstp, p); + xdr_ressize_check(rqstp, p); p = resp->buffer; *p++ = 0; /* no more entries */ @@ -505,6 +534,10 @@ nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p) struct nfsd_statfsres *resp = rqstp->rq_resp; struct kstatfs *stat = &resp->stats; + *p++ = resp->status; + if (resp->status != nfs_ok) + return xdr_ressize_check(rqstp, p); + *p++ = htonl(NFSSVC_MAXBLKSIZE_V2); /* max transfer size */ *p++ = htonl(stat->f_bsize); *p++ = htonl(stat->f_blocks); @@ -561,10 +594,23 @@ nfssvc_encode_entry(void *ccdv, const char *name, /* * XDR release functions */ -void -nfssvc_release_fhandle(struct svc_rqst *rqstp) +void nfssvc_release_attrstat(struct svc_rqst *rqstp) { - struct nfsd_fhandle *resp = rqstp->rq_resp; + struct nfsd_attrstat *resp = rqstp->rq_resp; + + fh_put(&resp->fh); +} + +void nfssvc_release_diropres(struct svc_rqst *rqstp) +{ + struct nfsd_diropres *resp = rqstp->rq_resp; + + fh_put(&resp->fh); +} + +void nfssvc_release_readres(struct svc_rqst *rqstp) +{ + struct nfsd_readres *resp = rqstp->rq_resp; fh_put(&resp->fh); } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 3b408532a5dc..9eae11a9d21c 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -693,31 +693,4 @@ extern void nfsd4_client_record_remove(struct nfs4_client *clp); extern int nfsd4_client_record_check(struct nfs4_client *clp); extern void nfsd4_record_grace_done(struct nfsd_net *nn); -/* nfs fault injection functions */ -#ifdef CONFIG_NFSD_FAULT_INJECTION -void nfsd_fault_inject_init(void); -void nfsd_fault_inject_cleanup(void); - -u64 nfsd_inject_print_clients(void); -u64 nfsd_inject_forget_client(struct sockaddr_storage *, size_t); -u64 nfsd_inject_forget_clients(u64); - -u64 nfsd_inject_print_locks(void); -u64 nfsd_inject_forget_client_locks(struct sockaddr_storage *, size_t); -u64 nfsd_inject_forget_locks(u64); - -u64 nfsd_inject_print_openowners(void); -u64 nfsd_inject_forget_client_openowners(struct sockaddr_storage *, size_t); -u64 nfsd_inject_forget_openowners(u64); - -u64 nfsd_inject_print_delegations(void); -u64 nfsd_inject_forget_client_delegations(struct sockaddr_storage *, size_t); -u64 nfsd_inject_forget_delegations(u64); -u64 nfsd_inject_recall_client_delegations(struct sockaddr_storage *, size_t); -u64 nfsd_inject_recall_delegations(u64); -#else /* CONFIG_NFSD_FAULT_INJECTION */ -static inline void nfsd_fault_inject_init(void) {} -static inline void nfsd_fault_inject_cleanup(void) {} -#endif /* CONFIG_NFSD_FAULT_INJECTION */ - #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 1861db1bdc67..99bf07800cd0 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -289,8 +289,8 @@ DEFINE_STATEID_EVENT(layout_recall_done); DEFINE_STATEID_EVENT(layout_recall_fail); DEFINE_STATEID_EVENT(layout_recall_release); -DEFINE_STATEID_EVENT(deleg_open); -DEFINE_STATEID_EVENT(deleg_none); +DEFINE_STATEID_EVENT(open); +DEFINE_STATEID_EVENT(deleg_read); DEFINE_STATEID_EVENT(deleg_break); DEFINE_STATEID_EVENT(deleg_recall); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index aba5af9df328..1ecaceebee13 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -2259,7 +2259,8 @@ out: __be32 nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name) { - int err, ret; + __be32 err; + int ret; err = fh_verify(rqstp, fhp, 0, NFSD_MAY_WRITE); if (err) @@ -2283,7 +2284,8 @@ __be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, void *buf, u32 len, u32 flags) { - int err, ret; + __be32 err; + int ret; err = fh_verify(rqstp, fhp, 0, NFSD_MAY_WRITE); if (err) diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index ea7cca3a64b7..0ff336b0b25f 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -82,27 +82,37 @@ struct nfsd_readdirargs { __be32 * buffer; }; +struct nfsd_stat { + __be32 status; +}; + struct nfsd_attrstat { + __be32 status; struct svc_fh fh; struct kstat stat; }; struct nfsd_diropres { + __be32 status; struct svc_fh fh; struct kstat stat; }; struct nfsd_readlinkres { + __be32 status; int len; }; struct nfsd_readres { + __be32 status; struct svc_fh fh; unsigned long count; struct kstat stat; }; struct nfsd_readdirres { + __be32 status; + int count; struct readdir_cd common; @@ -112,6 +122,7 @@ struct nfsd_readdirres { }; struct nfsd_statfsres { + __be32 status; struct kstatfs stats; }; @@ -146,6 +157,7 @@ int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *); int nfssvc_encode_void(struct svc_rqst *, __be32 *); +int nfssvc_encode_stat(struct svc_rqst *, __be32 *); int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *); int nfssvc_encode_diropres(struct svc_rqst *, __be32 *); int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *); @@ -156,7 +168,9 @@ int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *); int nfssvc_encode_entry(void *, const char *name, int namlen, loff_t offset, u64 ino, unsigned int); -void nfssvc_release_fhandle(struct svc_rqst *); +void nfssvc_release_attrstat(struct svc_rqst *rqstp); +void nfssvc_release_diropres(struct svc_rqst *rqstp); +void nfssvc_release_readres(struct svc_rqst *rqstp); /* Helper functions for NFSv2 ACL code */ __be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat); diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 4155fd71714c..ae6fa6c9cb46 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -273,6 +273,7 @@ union nfsd3_xdrstore { #define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore) +int nfs3svc_decode_voidarg(struct svc_rqst *, __be32 *); int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *); int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 66499fb6b567..679d40af1bbb 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -781,6 +781,7 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp); +int nfs4svc_decode_voidarg(struct svc_rqst *, __be32 *); int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *); int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *); int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 2eee5fb1a882..4abd928b0bc8 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -651,8 +651,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = nmaxinodes; buf->f_ffree = nfreeinodes; buf->f_namelen = NILFS_NAME_LEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 7dc3bc604f78..0d7e948cb29c 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2643,8 +2643,7 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) * the least significant 32-bits in f_fsid[0] and the most significant * 32-bits in f_fsid[1]. */ - sfs->f_fsid.val[0] = vol->serial_no & 0xffffffff; - sfs->f_fsid.val[1] = (vol->serial_no >> 32) & 0xffffffff; + sfs->f_fsid = u64_to_fsid(vol->serial_no); /* Maximum length of filenames. */ sfs->f_namelen = NTFS_MAX_NAME_LEN; return 0; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index b425f0b01dce..b9a9d69dde7e 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -883,6 +883,10 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) OCFS2_JOURNAL_DIRTY_FL); journal->j_journal = j_journal; + journal->j_journal->j_submit_inode_data_buffers = + jbd2_journal_submit_inode_data_buffers; + journal->j_journal->j_finish_inode_data_buffers = + jbd2_journal_finish_inode_data_buffers; journal->j_inode = inode; journal->j_bh = bh; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index b76ec6b88ded..ce93ccca8639 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -282,8 +282,7 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = sbi->s_num_blocks; buf->f_files = sbi->s_num_blocks; buf->f_namelen = OMFS_NAMELEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_bfree = buf->f_bavail = buf->f_ffree = omfs_count_free(s); @@ -363,12 +362,11 @@ static int omfs_get_imap(struct super_block *sb) bh = sb_bread(sb, block++); if (!bh) goto nomem_free; - *ptr = kmalloc(sb->s_blocksize, GFP_KERNEL); + *ptr = kmemdup(bh->b_data, sb->s_blocksize, GFP_KERNEL); if (!*ptr) { brelse(bh); goto nomem_free; } - memcpy(*ptr, bh->b_data, sb->s_blocksize); if (count < sb->s_blocksize) memset((void *)*ptr + count, 0xff, sb->s_blocksize - count); diff --git a/fs/proc/base.c b/fs/proc/base.c index aa69c35d904c..0f707003dda5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1268,6 +1268,10 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, kuid_t kloginuid; int rv; + /* Don't let kthreads write their own loginuid */ + if (current->flags & PF_KTHREAD) + return -EPERM; + rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { rcu_read_unlock(); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 28d6105e908e..58c075e2a452 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -297,6 +297,21 @@ static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) return rv; } +static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct proc_dir_entry *pde = PDE(file_inode(iocb->ki_filp)); + ssize_t ret; + + if (pde_is_permanent(pde)) + return pde->proc_ops->proc_read_iter(iocb, iter); + + if (!use_pde(pde)) + return -EIO; + ret = pde->proc_ops->proc_read_iter(iocb, iter); + unuse_pde(pde); + return ret; +} + static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos) { typeof_member(struct proc_ops, proc_read) read; @@ -572,9 +587,18 @@ static const struct file_operations proc_reg_file_ops = { .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = proc_reg_compat_ioctl, -#endif + .mmap = proc_reg_mmap, + .get_unmapped_area = proc_reg_get_unmapped_area, + .open = proc_reg_open, + .release = proc_reg_release, +}; + +static const struct file_operations proc_iter_file_ops = { + .llseek = proc_reg_llseek, + .read_iter = proc_reg_read_iter, + .write = proc_reg_write, + .poll = proc_reg_poll, + .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, @@ -582,12 +606,26 @@ static const struct file_operations proc_reg_file_ops = { }; #ifdef CONFIG_COMPAT -static const struct file_operations proc_reg_file_ops_no_compat = { +static const struct file_operations proc_reg_file_ops_compat = { .llseek = proc_reg_llseek, .read = proc_reg_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, + .compat_ioctl = proc_reg_compat_ioctl, + .mmap = proc_reg_mmap, + .get_unmapped_area = proc_reg_get_unmapped_area, + .open = proc_reg_open, + .release = proc_reg_release, +}; + +static const struct file_operations proc_iter_file_ops_compat = { + .llseek = proc_reg_llseek, + .read_iter = proc_reg_read_iter, + .write = proc_reg_write, + .poll = proc_reg_poll, + .unlocked_ioctl = proc_reg_unlocked_ioctl, + .compat_ioctl = proc_reg_compat_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, @@ -619,42 +657,51 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { struct inode *inode = new_inode(sb); - if (inode) { - inode->i_ino = de->low_ino; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - PROC_I(inode)->pde = de; + if (!inode) { + pde_put(de); + return NULL; + } - if (is_empty_pde(de)) { - make_empty_dir_inode(inode); - return inode; - } - if (de->mode) { - inode->i_mode = de->mode; - inode->i_uid = de->uid; - inode->i_gid = de->gid; - } - if (de->size) - inode->i_size = de->size; - if (de->nlink) - set_nlink(inode, de->nlink); + inode->i_ino = de->low_ino; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + PROC_I(inode)->pde = de; + if (is_empty_pde(de)) { + make_empty_dir_inode(inode); + return inode; + } - if (S_ISREG(inode->i_mode)) { - inode->i_op = de->proc_iops; + if (de->mode) { + inode->i_mode = de->mode; + inode->i_uid = de->uid; + inode->i_gid = de->gid; + } + if (de->size) + inode->i_size = de->size; + if (de->nlink) + set_nlink(inode, de->nlink); + + if (S_ISREG(inode->i_mode)) { + inode->i_op = de->proc_iops; + if (de->proc_ops->proc_read_iter) + inode->i_fop = &proc_iter_file_ops; + else inode->i_fop = &proc_reg_file_ops; #ifdef CONFIG_COMPAT - if (!de->proc_ops->proc_compat_ioctl) { - inode->i_fop = &proc_reg_file_ops_no_compat; - } + if (de->proc_ops->proc_compat_ioctl) { + if (de->proc_ops->proc_read_iter) + inode->i_fop = &proc_iter_file_ops_compat; + else + inode->i_fop = &proc_reg_file_ops_compat; + } #endif - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = de->proc_iops; - inode->i_fop = de->proc_dir_ops; - } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = de->proc_iops; - inode->i_fop = NULL; - } else - BUG(); - } else - pde_put(de); + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = de->proc_iops; + inode->i_fop = de->proc_dir_ops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = de->proc_iops; + inode->i_fop = NULL; + } else { + BUG(); + } return inode; } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 6c1166ccdaea..317899222d7f 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -12,6 +12,7 @@ #include <linux/cred.h> #include <linux/namei.h> #include <linux/mm.h> +#include <linux/uio.h> #include <linux/module.h> #include <linux/bpf-cgroup.h> #include <linux/mount.h> @@ -540,13 +541,14 @@ out: return err; } -static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf, - size_t count, loff_t *ppos, int write) +static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, + int write) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(iocb->ki_filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; - void *kbuf; + size_t count = iov_iter_count(iter); + char *kbuf; ssize_t error; if (IS_ERR(head)) @@ -569,32 +571,30 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf, error = -ENOMEM; if (count >= KMALLOC_MAX_SIZE) goto out; + kbuf = kzalloc(count + 1, GFP_KERNEL); + if (!kbuf) + goto out; if (write) { - kbuf = memdup_user_nul(ubuf, count); - if (IS_ERR(kbuf)) { - error = PTR_ERR(kbuf); - goto out; - } - } else { - kbuf = kzalloc(count, GFP_KERNEL); - if (!kbuf) - goto out; + error = -EFAULT; + if (!copy_from_iter_full(kbuf, count, iter)) + goto out_free_buf; + kbuf[count] = '\0'; } error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count, - ppos); + &iocb->ki_pos); if (error) goto out_free_buf; /* careful: calling conventions are nasty here */ - error = table->proc_handler(table, write, kbuf, &count, ppos); + error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos); if (error) goto out_free_buf; if (!write) { error = -EFAULT; - if (copy_to_user(ubuf, kbuf, count)) + if (copy_to_iter(kbuf, count, iter) < count) goto out_free_buf; } @@ -607,16 +607,14 @@ out: return error; } -static ssize_t proc_sys_read(struct file *filp, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter) { - return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0); + return proc_sys_call_handler(iocb, iter, 0); } -static ssize_t proc_sys_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter) { - return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); + return proc_sys_call_handler(iocb, iter, 1); } static int proc_sys_open(struct inode *inode, struct file *filp) @@ -853,8 +851,10 @@ static int proc_sys_getattr(const struct path *path, struct kstat *stat, static const struct file_operations proc_sys_file_operations = { .open = proc_sys_open, .poll = proc_sys_poll, - .read = proc_sys_read, - .write = proc_sys_write, + .read_iter = proc_sys_read, + .write_iter = proc_sys_write, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .llseek = default_llseek, }; diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 3059a9394c2d..e59d4bb3a89e 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -70,6 +70,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) { MNT_NOATIME, ",noatime" }, { MNT_NODIRATIME, ",nodiratime" }, { MNT_RELATIME, ",relatime" }, + { MNT_NOSYMFOLLOW, ",nosymfollow" }, { 0, NULL } }; const struct proc_fs_opts *fs_infop; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index e8da1cde87b9..3fb7fc819b4f 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -137,8 +137,7 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree = qnx4_count_free_blocks(sb); buf->f_bavail = buf->f_bfree; buf->f_namelen = QNX4_NAME_MAX; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 755293c8c71a..61191f7bdf62 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -166,8 +166,7 @@ static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_ffree = fs32_to_cpu(sbi, sbi->sb->sb_free_inodes); buf->f_bavail = buf->f_bfree; buf->f_namelen = QNX6_LONG_NAME_MAX; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/read_write.c b/fs/read_write.c index 19f5c4bf75aa..75f764b43418 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -419,27 +419,42 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo return ret; } +static int warn_unsupported(struct file *file, const char *op) +{ + pr_warn_ratelimited( + "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n", + op, file, current->pid, current->comm); + return -EINVAL; +} + ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { - mm_segment_t old_fs = get_fs(); + struct kvec iov = { + .iov_base = buf, + .iov_len = min_t(size_t, count, MAX_RW_COUNT), + }; + struct kiocb kiocb; + struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) return -EINVAL; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; + /* + * Also fail if ->read_iter and ->read are both wired up as that + * implies very convoluted semantics. + */ + if (unlikely(!file->f_op->read_iter || file->f_op->read)) + return warn_unsupported(file, "read"); - if (count > MAX_RW_COUNT) - count = MAX_RW_COUNT; - set_fs(KERNEL_DS); - if (file->f_op->read) - ret = file->f_op->read(file, (void __user *)buf, count, pos); - else if (file->f_op->read_iter) - ret = new_sync_read(file, (void __user *)buf, count, pos); - else - ret = -EINVAL; - set_fs(old_fs); + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = pos ? *pos : 0; + iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len); + ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { + if (pos) + *pos = kiocb.ki_pos; fsnotify_access(file); add_rchar(current, ret); } @@ -510,28 +525,32 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { - mm_segment_t old_fs; - const char __user *p; + struct kvec iov = { + .iov_base = (void *)buf, + .iov_len = min_t(size_t, count, MAX_RW_COUNT), + }; + struct kiocb kiocb; + struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; + /* + * Also fail if ->write_iter and ->write are both wired up as that + * implies very convoluted semantics. + */ + if (unlikely(!file->f_op->write_iter || file->f_op->write)) + return warn_unsupported(file, "write"); - old_fs = get_fs(); - set_fs(KERNEL_DS); - p = (__force const char __user *)buf; - if (count > MAX_RW_COUNT) - count = MAX_RW_COUNT; - if (file->f_op->write) - ret = file->f_op->write(file, p, count, pos); - else if (file->f_op->write_iter) - ret = new_sync_write(file, p, count, pos); - else - ret = -EINVAL; - set_fs(old_fs); + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = pos ? *pos : 0; + iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); + ret = file->f_op->write_iter(&kiocb, &iter); if (ret > 0) { + if (pos) + *pos = kiocb.ki_pos; fsnotify_modify(file); add_wchar(current, ret); } @@ -889,7 +908,7 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, } EXPORT_SYMBOL(vfs_iter_write); -ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, +static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; @@ -1392,6 +1411,59 @@ static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in, } /* + * Performs necessary checks before doing a file copy + * + * Can adjust amount of bytes to copy via @req_count argument. + * Returns appropriate error code that caller should return or + * zero in case the copy should be allowed. + */ +static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t *req_count, unsigned int flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + uint64_t count = *req_count; + loff_t size_in; + int ret; + + ret = generic_file_rw_checks(file_in, file_out); + if (ret) + return ret; + + /* Don't touch certain kinds of inodes */ + if (IS_IMMUTABLE(inode_out)) + return -EPERM; + + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + return -ETXTBSY; + + /* Ensure offsets don't wrap. */ + if (pos_in + count < pos_in || pos_out + count < pos_out) + return -EOVERFLOW; + + /* Shorten the copy to EOF */ + size_in = i_size_read(inode_in); + if (pos_in >= size_in) + count = 0; + else + count = min(count, size_in - (uint64_t)pos_in); + + ret = generic_write_check_limits(file_out, pos_out, &count); + if (ret) + return ret; + + /* Don't allow overlapped copying within the same file. */ + if (inode_in == inode_out && + pos_out + count > pos_in && + pos_out < pos_in + count) + return -EINVAL; + + *req_count = count; + return 0; +} + +/* * copy_file_range() differs from regular file read and write in that it * specifically allows return partial success. When it does so is up to * the copy_file_range method. @@ -1523,475 +1595,92 @@ out2: return ret; } -static int remap_verify_area(struct file *file, loff_t pos, loff_t len, - bool write) -{ - struct inode *inode = file_inode(file); - - if (unlikely(pos < 0 || len < 0)) - return -EINVAL; - - if (unlikely((loff_t) (pos + len) < 0)) - return -EINVAL; - - if (unlikely(inode->i_flctx && mandatory_lock(inode))) { - loff_t end = len ? pos + len - 1 : OFFSET_MAX; - int retval; - - retval = locks_mandatory_area(inode, file, pos, end, - write ? F_WRLCK : F_RDLCK); - if (retval < 0) - return retval; - } - - return security_file_permission(file, write ? MAY_WRITE : MAY_READ); -} -/* - * Ensure that we don't remap a partial EOF block in the middle of something - * else. Assume that the offsets have already been checked for block - * alignment. - * - * For clone we only link a partial EOF block above or at the destination file's - * EOF. For deduplication we accept a partial EOF block only if it ends at the - * destination file's EOF (can not link it into the middle of a file). - * - * Shorten the request if possible. - */ -static int generic_remap_check_len(struct inode *inode_in, - struct inode *inode_out, - loff_t pos_out, - loff_t *len, - unsigned int remap_flags) -{ - u64 blkmask = i_blocksize(inode_in) - 1; - loff_t new_len = *len; - - if ((*len & blkmask) == 0) - return 0; - - if (pos_out + *len < i_size_read(inode_out)) - new_len &= ~blkmask; - - if (new_len == *len) - return 0; - - if (remap_flags & REMAP_FILE_CAN_SHORTEN) { - *len = new_len; - return 0; - } - - return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; -} - -/* Read a page's worth of file data into the page cache. */ -static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) -{ - struct page *page; - - page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); - return ERR_PTR(-EIO); - } - return page; -} - /* - * Lock two pages, ensuring that we lock in offset order if the pages are from - * the same file. + * Don't operate on ranges the page cache doesn't support, and don't exceed the + * LFS limits. If pos is under the limit it becomes a short access. If it + * exceeds the limit we return -EFBIG. */ -static void vfs_lock_two_pages(struct page *page1, struct page *page2) +int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) { - /* Always lock in order of increasing index. */ - if (page1->index > page2->index) - swap(page1, page2); - - lock_page(page1); - if (page1 != page2) - lock_page(page2); -} - -/* Unlock two pages, being careful not to unlock the same page twice. */ -static void vfs_unlock_two_pages(struct page *page1, struct page *page2) -{ - unlock_page(page1); - if (page1 != page2) - unlock_page(page2); -} - -/* - * Compare extents of two files to see if they are the same. - * Caller must have locked both inodes to prevent write races. - */ -static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, - struct inode *dest, loff_t destoff, - loff_t len, bool *is_same) -{ - loff_t src_poff; - loff_t dest_poff; - void *src_addr; - void *dest_addr; - struct page *src_page; - struct page *dest_page; - loff_t cmp_len; - bool same; - int error; - - error = -EINVAL; - same = true; - while (len) { - src_poff = srcoff & (PAGE_SIZE - 1); - dest_poff = destoff & (PAGE_SIZE - 1); - cmp_len = min(PAGE_SIZE - src_poff, - PAGE_SIZE - dest_poff); - cmp_len = min(cmp_len, len); - if (cmp_len <= 0) - goto out_error; - - src_page = vfs_dedupe_get_page(src, srcoff); - if (IS_ERR(src_page)) { - error = PTR_ERR(src_page); - goto out_error; - } - dest_page = vfs_dedupe_get_page(dest, destoff); - if (IS_ERR(dest_page)) { - error = PTR_ERR(dest_page); - put_page(src_page); - goto out_error; - } - - vfs_lock_two_pages(src_page, dest_page); + struct inode *inode = file->f_mapping->host; + loff_t max_size = inode->i_sb->s_maxbytes; + loff_t limit = rlimit(RLIMIT_FSIZE); - /* - * Now that we've locked both pages, make sure they're still - * mapped to the file data we're interested in. If not, - * someone is invalidating pages on us and we lose. - */ - if (!PageUptodate(src_page) || !PageUptodate(dest_page) || - src_page->mapping != src->i_mapping || - dest_page->mapping != dest->i_mapping) { - same = false; - goto unlock; + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; } + *count = min(*count, limit - pos); + } - src_addr = kmap_atomic(src_page); - dest_addr = kmap_atomic(dest_page); - - flush_dcache_page(src_page); - flush_dcache_page(dest_page); - - if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) - same = false; + if (!(file->f_flags & O_LARGEFILE)) + max_size = MAX_NON_LFS; - kunmap_atomic(dest_addr); - kunmap_atomic(src_addr); -unlock: - vfs_unlock_two_pages(src_page, dest_page); - put_page(dest_page); - put_page(src_page); + if (unlikely(pos >= max_size)) + return -EFBIG; - if (!same) - break; + *count = min(*count, max_size - pos); - srcoff += cmp_len; - destoff += cmp_len; - len -= cmp_len; - } - - *is_same = same; return 0; - -out_error: - return error; } /* - * Check that the two inodes are eligible for cloning, the ranges make - * sense, and then flush all dirty data. Caller must ensure that the - * inodes have been locked against any other modifications. + * Performs necessary checks before doing a write * - * If there's an error, then the usual negative error code is returned. - * Otherwise returns 0 with *len set to the request length. + * Can adjust writing position or amount of bytes to write. + * Returns appropriate error code that caller should return or + * zero in case that write should be allowed. */ -int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) +ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - bool same_inode = (inode_in == inode_out); + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + loff_t count; int ret; - /* Don't touch certain kinds of inodes */ - if (IS_IMMUTABLE(inode_out)) - return -EPERM; - - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + if (IS_SWAPFILE(inode)) return -ETXTBSY; - /* Don't reflink dirs, pipes, sockets... */ - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - return -EISDIR; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - return -EINVAL; - - /* Zero length dedupe exits immediately; reflink goes to EOF. */ - if (*len == 0) { - loff_t isize = i_size_read(inode_in); - - if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) - return 0; - if (pos_in > isize) - return -EINVAL; - *len = isize - pos_in; - if (*len == 0) - return 0; - } - - /* Check that we don't violate system file offset limits. */ - ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + *len - 1); - if (ret) - return ret; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + *len - 1); - if (ret) - return ret; - - /* - * Check that the extents are the same. - */ - if (remap_flags & REMAP_FILE_DEDUP) { - bool is_same = false; - - ret = vfs_dedupe_file_range_compare(inode_in, pos_in, - inode_out, pos_out, *len, &is_same); - if (ret) - return ret; - if (!is_same) - return -EBADE; - } - - ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* If can't alter the file contents, we're done. */ - if (!(remap_flags & REMAP_FILE_DEDUP)) - ret = file_modified(file_out); - - return ret; -} -EXPORT_SYMBOL(generic_remap_file_range_prep); - -loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags) -{ - loff_t ret; - - WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP); - - /* - * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on - * the same mount. Practically, they only need to be on the same file - * system. - */ - if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) - return -EXDEV; - - ret = generic_file_rw_checks(file_in, file_out); - if (ret < 0) - return ret; - - if (!file_in->f_op->remap_file_range) - return -EOPNOTSUPP; - - ret = remap_verify_area(file_in, pos_in, len, false); - if (ret) - return ret; - - ret = remap_verify_area(file_out, pos_out, len, true); - if (ret) - return ret; - - ret = file_in->f_op->remap_file_range(file_in, pos_in, - file_out, pos_out, len, remap_flags); - if (ret < 0) - return ret; - - fsnotify_access(file_in); - fsnotify_modify(file_out); - return ret; -} -EXPORT_SYMBOL(do_clone_file_range); - -loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags) -{ - loff_t ret; - - file_start_write(file_out); - ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, - remap_flags); - file_end_write(file_out); - - return ret; -} -EXPORT_SYMBOL(vfs_clone_file_range); - -/* Check whether we are allowed to dedupe the destination file */ -static bool allow_file_dedupe(struct file *file) -{ - if (capable(CAP_SYS_ADMIN)) - return true; - if (file->f_mode & FMODE_WRITE) - return true; - if (uid_eq(current_fsuid(), file_inode(file)->i_uid)) - return true; - if (!inode_permission(file_inode(file), MAY_WRITE)) - return true; - return false; -} + if (!iov_iter_count(from)) + return 0; -loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, - struct file *dst_file, loff_t dst_pos, - loff_t len, unsigned int remap_flags) -{ - loff_t ret; + /* FIXME: this is for backwards compatibility with 2.4 */ + if (iocb->ki_flags & IOCB_APPEND) + iocb->ki_pos = i_size_read(inode); - WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | - REMAP_FILE_CAN_SHORTEN)); + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; - ret = mnt_want_write_file(dst_file); + count = iov_iter_count(from); + ret = generic_write_check_limits(file, iocb->ki_pos, &count); if (ret) return ret; - ret = remap_verify_area(dst_file, dst_pos, len, true); - if (ret < 0) - goto out_drop_write; - - ret = -EPERM; - if (!allow_file_dedupe(dst_file)) - goto out_drop_write; - - ret = -EXDEV; - if (src_file->f_path.mnt != dst_file->f_path.mnt) - goto out_drop_write; - - ret = -EISDIR; - if (S_ISDIR(file_inode(dst_file)->i_mode)) - goto out_drop_write; - - ret = -EINVAL; - if (!dst_file->f_op->remap_file_range) - goto out_drop_write; - - if (len == 0) { - ret = 0; - goto out_drop_write; - } - - ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file, - dst_pos, len, remap_flags | REMAP_FILE_DEDUP); -out_drop_write: - mnt_drop_write_file(dst_file); - - return ret; + iov_iter_truncate(from, count); + return iov_iter_count(from); } -EXPORT_SYMBOL(vfs_dedupe_file_range_one); +EXPORT_SYMBOL(generic_write_checks); -int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) +/* + * Performs common checks before doing a file copy/clone + * from @file_in to @file_out. + */ +int generic_file_rw_checks(struct file *file_in, struct file *file_out) { - struct file_dedupe_range_info *info; - struct inode *src = file_inode(file); - u64 off; - u64 len; - int i; - int ret; - u16 count = same->dest_count; - loff_t deduped; - - if (!(file->f_mode & FMODE_READ)) - return -EINVAL; - - if (same->reserved1 || same->reserved2) - return -EINVAL; - - off = same->src_offset; - len = same->src_length; + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); - if (S_ISDIR(src->i_mode)) + /* Don't copy dirs, pipes, sockets... */ + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; - - if (!S_ISREG(src->i_mode)) - return -EINVAL; - - if (!file->f_op->remap_file_range) - return -EOPNOTSUPP; - - ret = remap_verify_area(file, off, len, false); - if (ret < 0) - return ret; - ret = 0; - - if (off + len > i_size_read(src)) + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; - /* Arbitrary 1G limit on a single dedupe request, can be raised. */ - len = min_t(u64, len, 1 << 30); - - /* pre-format output fields to sane values */ - for (i = 0; i < count; i++) { - same->info[i].bytes_deduped = 0ULL; - same->info[i].status = FILE_DEDUPE_RANGE_SAME; - } - - for (i = 0, info = same->info; i < count; i++, info++) { - struct fd dst_fd = fdget(info->dest_fd); - struct file *dst_file = dst_fd.file; - - if (!dst_file) { - info->status = -EBADF; - goto next_loop; - } - - if (info->reserved) { - info->status = -EINVAL; - goto next_fdput; - } - - deduped = vfs_dedupe_file_range_one(file, off, dst_file, - info->dest_offset, len, - REMAP_FILE_CAN_SHORTEN); - if (deduped == -EBADE) - info->status = FILE_DEDUPE_RANGE_DIFFERS; - else if (deduped < 0) - info->status = deduped; - else - info->bytes_deduped = len; + if (!(file_in->f_mode & FMODE_READ) || + !(file_out->f_mode & FMODE_WRITE) || + (file_out->f_flags & O_APPEND)) + return -EBADF; -next_fdput: - fdput(dst_fd); -next_loop: - if (fatal_signal_pending(current)) - break; - } - return ret; + return 0; } -EXPORT_SYMBOL(vfs_dedupe_file_range); diff --git a/fs/remap_range.c b/fs/remap_range.c new file mode 100644 index 000000000000..e6099beefa97 --- /dev/null +++ b/fs/remap_range.c @@ -0,0 +1,571 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/slab.h> +#include <linux/stat.h> +#include <linux/sched/xacct.h> +#include <linux/fcntl.h> +#include <linux/file.h> +#include <linux/uio.h> +#include <linux/fsnotify.h> +#include <linux/security.h> +#include <linux/export.h> +#include <linux/syscalls.h> +#include <linux/pagemap.h> +#include <linux/splice.h> +#include <linux/compat.h> +#include <linux/mount.h> +#include <linux/fs.h> +#include "internal.h" + +#include <linux/uaccess.h> +#include <asm/unistd.h> + +/* + * Performs necessary checks before doing a clone. + * + * Can adjust amount of bytes to clone via @req_count argument. + * Returns appropriate error code that caller should return or + * zero in case the clone should be allowed. + */ +static int generic_remap_checks(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *req_count, unsigned int remap_flags) +{ + struct inode *inode_in = file_in->f_mapping->host; + struct inode *inode_out = file_out->f_mapping->host; + uint64_t count = *req_count; + uint64_t bcount; + loff_t size_in, size_out; + loff_t bs = inode_out->i_sb->s_blocksize; + int ret; + + /* The start of both ranges must be aligned to an fs block. */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) + return -EINVAL; + + /* Ensure offsets don't wrap. */ + if (pos_in + count < pos_in || pos_out + count < pos_out) + return -EINVAL; + + size_in = i_size_read(inode_in); + size_out = i_size_read(inode_out); + + /* Dedupe requires both ranges to be within EOF. */ + if ((remap_flags & REMAP_FILE_DEDUP) && + (pos_in >= size_in || pos_in + count > size_in || + pos_out >= size_out || pos_out + count > size_out)) + return -EINVAL; + + /* Ensure the infile range is within the infile. */ + if (pos_in >= size_in) + return -EINVAL; + count = min(count, size_in - (uint64_t)pos_in); + + ret = generic_write_check_limits(file_out, pos_out, &count); + if (ret) + return ret; + + /* + * If the user wanted us to link to the infile's EOF, round up to the + * next block boundary for this check. + * + * Otherwise, make sure the count is also block-aligned, having + * already confirmed the starting offsets' block alignment. + */ + if (pos_in + count == size_in) { + bcount = ALIGN(size_in, bs) - pos_in; + } else { + if (!IS_ALIGNED(count, bs)) + count = ALIGN_DOWN(count, bs); + bcount = count; + } + + /* Don't allow overlapped cloning within the same file. */ + if (inode_in == inode_out && + pos_out + bcount > pos_in && + pos_out < pos_in + bcount) + return -EINVAL; + + /* + * We shortened the request but the caller can't deal with that, so + * bounce the request back to userspace. + */ + if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) + return -EINVAL; + + *req_count = count; + return 0; +} + +static int remap_verify_area(struct file *file, loff_t pos, loff_t len, + bool write) +{ + struct inode *inode = file_inode(file); + + if (unlikely(pos < 0 || len < 0)) + return -EINVAL; + + if (unlikely((loff_t) (pos + len) < 0)) + return -EINVAL; + + if (unlikely(inode->i_flctx && mandatory_lock(inode))) { + loff_t end = len ? pos + len - 1 : OFFSET_MAX; + int retval; + + retval = locks_mandatory_area(inode, file, pos, end, + write ? F_WRLCK : F_RDLCK); + if (retval < 0) + return retval; + } + + return security_file_permission(file, write ? MAY_WRITE : MAY_READ); +} + +/* + * Ensure that we don't remap a partial EOF block in the middle of something + * else. Assume that the offsets have already been checked for block + * alignment. + * + * For clone we only link a partial EOF block above or at the destination file's + * EOF. For deduplication we accept a partial EOF block only if it ends at the + * destination file's EOF (can not link it into the middle of a file). + * + * Shorten the request if possible. + */ +static int generic_remap_check_len(struct inode *inode_in, + struct inode *inode_out, + loff_t pos_out, + loff_t *len, + unsigned int remap_flags) +{ + u64 blkmask = i_blocksize(inode_in) - 1; + loff_t new_len = *len; + + if ((*len & blkmask) == 0) + return 0; + + if (pos_out + *len < i_size_read(inode_out)) + new_len &= ~blkmask; + + if (new_len == *len) + return 0; + + if (remap_flags & REMAP_FILE_CAN_SHORTEN) { + *len = new_len; + return 0; + } + + return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; +} + +/* Read a page's worth of file data into the page cache. */ +static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) +{ + struct page *page; + + page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + return page; +} + +/* + * Lock two pages, ensuring that we lock in offset order if the pages are from + * the same file. + */ +static void vfs_lock_two_pages(struct page *page1, struct page *page2) +{ + /* Always lock in order of increasing index. */ + if (page1->index > page2->index) + swap(page1, page2); + + lock_page(page1); + if (page1 != page2) + lock_page(page2); +} + +/* Unlock two pages, being careful not to unlock the same page twice. */ +static void vfs_unlock_two_pages(struct page *page1, struct page *page2) +{ + unlock_page(page1); + if (page1 != page2) + unlock_page(page2); +} + +/* + * Compare extents of two files to see if they are the same. + * Caller must have locked both inodes to prevent write races. + */ +static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same) +{ + loff_t src_poff; + loff_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + loff_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + if (cmp_len <= 0) + goto out_error; + + src_page = vfs_dedupe_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = vfs_dedupe_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + put_page(src_page); + goto out_error; + } + + vfs_lock_two_pages(src_page, dest_page); + + /* + * Now that we've locked both pages, make sure they're still + * mapped to the file data we're interested in. If not, + * someone is invalidating pages on us and we lose. + */ + if (!PageUptodate(src_page) || !PageUptodate(dest_page) || + src_page->mapping != src->i_mapping || + dest_page->mapping != dest->i_mapping) { + same = false; + goto unlock; + } + + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); +unlock: + vfs_unlock_two_pages(src_page, dest_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + return error; +} + +/* + * Check that the two inodes are eligible for cloning, the ranges make + * sense, and then flush all dirty data. Caller must ensure that the + * inodes have been locked against any other modifications. + * + * If there's an error, then the usual negative error code is returned. + * Otherwise returns 0 with *len set to the request length. + */ +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + bool same_inode = (inode_in == inode_out); + int ret; + + /* Don't touch certain kinds of inodes */ + if (IS_IMMUTABLE(inode_out)) + return -EPERM; + + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + return -ETXTBSY; + + /* Don't reflink dirs, pipes, sockets... */ + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + + /* Zero length dedupe exits immediately; reflink goes to EOF. */ + if (*len == 0) { + loff_t isize = i_size_read(inode_in); + + if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) + return 0; + if (pos_in > isize) + return -EINVAL; + *len = isize - pos_in; + if (*len == 0) + return 0; + } + + /* Check that we don't violate system file offset limits. */ + ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, + remap_flags); + if (ret) + return ret; + + /* Wait for the completion of any pending IOs on both files */ + inode_dio_wait(inode_in); + if (!same_inode) + inode_dio_wait(inode_out); + + ret = filemap_write_and_wait_range(inode_in->i_mapping, + pos_in, pos_in + *len - 1); + if (ret) + return ret; + + ret = filemap_write_and_wait_range(inode_out->i_mapping, + pos_out, pos_out + *len - 1); + if (ret) + return ret; + + /* + * Check that the extents are the same. + */ + if (remap_flags & REMAP_FILE_DEDUP) { + bool is_same = false; + + ret = vfs_dedupe_file_range_compare(inode_in, pos_in, + inode_out, pos_out, *len, &is_same); + if (ret) + return ret; + if (!is_same) + return -EBADE; + } + + ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, + remap_flags); + if (ret) + return ret; + + /* If can't alter the file contents, we're done. */ + if (!(remap_flags & REMAP_FILE_DEDUP)) + ret = file_modified(file_out); + + return ret; +} +EXPORT_SYMBOL(generic_remap_file_range_prep); + +loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) +{ + loff_t ret; + + WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP); + + /* + * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on + * the same mount. Practically, they only need to be on the same file + * system. + */ + if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) + return -EXDEV; + + ret = generic_file_rw_checks(file_in, file_out); + if (ret < 0) + return ret; + + if (!file_in->f_op->remap_file_range) + return -EOPNOTSUPP; + + ret = remap_verify_area(file_in, pos_in, len, false); + if (ret) + return ret; + + ret = remap_verify_area(file_out, pos_out, len, true); + if (ret) + return ret; + + ret = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, len, remap_flags); + if (ret < 0) + return ret; + + fsnotify_access(file_in); + fsnotify_modify(file_out); + return ret; +} +EXPORT_SYMBOL(do_clone_file_range); + +loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) +{ + loff_t ret; + + file_start_write(file_out); + ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, + remap_flags); + file_end_write(file_out); + + return ret; +} +EXPORT_SYMBOL(vfs_clone_file_range); + +/* Check whether we are allowed to dedupe the destination file */ +static bool allow_file_dedupe(struct file *file) +{ + if (capable(CAP_SYS_ADMIN)) + return true; + if (file->f_mode & FMODE_WRITE) + return true; + if (uid_eq(current_fsuid(), file_inode(file)->i_uid)) + return true; + if (!inode_permission(file_inode(file), MAY_WRITE)) + return true; + return false; +} + +loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, + struct file *dst_file, loff_t dst_pos, + loff_t len, unsigned int remap_flags) +{ + loff_t ret; + + WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | + REMAP_FILE_CAN_SHORTEN)); + + ret = mnt_want_write_file(dst_file); + if (ret) + return ret; + + ret = remap_verify_area(dst_file, dst_pos, len, true); + if (ret < 0) + goto out_drop_write; + + ret = -EPERM; + if (!allow_file_dedupe(dst_file)) + goto out_drop_write; + + ret = -EXDEV; + if (src_file->f_path.mnt != dst_file->f_path.mnt) + goto out_drop_write; + + ret = -EISDIR; + if (S_ISDIR(file_inode(dst_file)->i_mode)) + goto out_drop_write; + + ret = -EINVAL; + if (!dst_file->f_op->remap_file_range) + goto out_drop_write; + + if (len == 0) { + ret = 0; + goto out_drop_write; + } + + ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file, + dst_pos, len, remap_flags | REMAP_FILE_DEDUP); +out_drop_write: + mnt_drop_write_file(dst_file); + + return ret; +} +EXPORT_SYMBOL(vfs_dedupe_file_range_one); + +int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) +{ + struct file_dedupe_range_info *info; + struct inode *src = file_inode(file); + u64 off; + u64 len; + int i; + int ret; + u16 count = same->dest_count; + loff_t deduped; + + if (!(file->f_mode & FMODE_READ)) + return -EINVAL; + + if (same->reserved1 || same->reserved2) + return -EINVAL; + + off = same->src_offset; + len = same->src_length; + + if (S_ISDIR(src->i_mode)) + return -EISDIR; + + if (!S_ISREG(src->i_mode)) + return -EINVAL; + + if (!file->f_op->remap_file_range) + return -EOPNOTSUPP; + + ret = remap_verify_area(file, off, len, false); + if (ret < 0) + return ret; + ret = 0; + + if (off + len > i_size_read(src)) + return -EINVAL; + + /* Arbitrary 1G limit on a single dedupe request, can be raised. */ + len = min_t(u64, len, 1 << 30); + + /* pre-format output fields to sane values */ + for (i = 0; i < count; i++) { + same->info[i].bytes_deduped = 0ULL; + same->info[i].status = FILE_DEDUPE_RANGE_SAME; + } + + for (i = 0, info = same->info; i < count; i++, info++) { + struct fd dst_fd = fdget(info->dest_fd); + struct file *dst_file = dst_fd.file; + + if (!dst_file) { + info->status = -EBADF; + goto next_loop; + } + + if (info->reserved) { + info->status = -EINVAL; + goto next_fdput; + } + + deduped = vfs_dedupe_file_range_one(file, off, dst_file, + info->dest_offset, len, + REMAP_FILE_CAN_SHORTEN); + if (deduped == -EBADE) + info->status = FILE_DEDUPE_RANGE_DIFFERS; + else if (deduped < 0) + info->status = deduped; + else + info->bytes_deduped = len; + +next_fdput: + fdput(dst_fd); +next_loop: + if (fatal_signal_pending(current)) + break; + } + return ret; +} +EXPORT_SYMBOL(vfs_dedupe_file_range); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index b1b7d3f5752f..259f684d9236 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -416,8 +416,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree = buf->f_bavail = buf->f_ffree; buf->f_blocks = (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/splice.c b/fs/splice.c index 70cc52af780b..866d5c2367b2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -341,89 +341,6 @@ const struct pipe_buf_operations nosteal_pipe_buf_ops = { }; EXPORT_SYMBOL(nosteal_pipe_buf_ops); -static ssize_t kernel_readv(struct file *file, const struct kvec *vec, - unsigned long vlen, loff_t offset) -{ - mm_segment_t old_fs; - loff_t pos = offset; - ssize_t res; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - /* The cast to a user pointer is valid due to the set_fs() */ - res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0); - set_fs(old_fs); - - return res; -} - -static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - struct kvec *vec, __vec[PIPE_DEF_BUFFERS]; - struct iov_iter to; - struct page **pages; - unsigned int nr_pages; - unsigned int mask; - size_t offset, base, copied = 0; - ssize_t res; - int i; - - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) - return -EAGAIN; - - /* - * Try to keep page boundaries matching to source pagecache ones - - * it probably won't be much help, but... - */ - offset = *ppos & ~PAGE_MASK; - - iov_iter_pipe(&to, READ, pipe, len + offset); - - res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base); - if (res <= 0) - return -ENOMEM; - - nr_pages = DIV_ROUND_UP(res + base, PAGE_SIZE); - - vec = __vec; - if (nr_pages > PIPE_DEF_BUFFERS) { - vec = kmalloc_array(nr_pages, sizeof(struct kvec), GFP_KERNEL); - if (unlikely(!vec)) { - res = -ENOMEM; - goto out; - } - } - - mask = pipe->ring_size - 1; - pipe->bufs[to.head & mask].offset = offset; - pipe->bufs[to.head & mask].len -= offset; - - for (i = 0; i < nr_pages; i++) { - size_t this_len = min_t(size_t, len, PAGE_SIZE - offset); - vec[i].iov_base = page_address(pages[i]) + offset; - vec[i].iov_len = this_len; - len -= this_len; - offset = 0; - } - - res = kernel_readv(in, vec, nr_pages, *ppos); - if (res > 0) { - copied = res; - *ppos += res; - } - - if (vec != __vec) - kfree(vec); -out: - for (i = 0; i < nr_pages; i++) - put_page(pages[i]); - kvfree(pages); - iov_iter_advance(&to, copied); /* truncates and discards */ - return res; -} - /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' * using sendpage(). Return the number of bytes sent. @@ -807,33 +724,6 @@ done: EXPORT_SYMBOL(iter_file_splice_write); -static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, - struct splice_desc *sd) -{ - int ret; - void *data; - loff_t tmp = sd->pos; - - data = kmap(buf->page); - ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); - kunmap(buf->page); - - return ret; -} - -static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, - struct file *out, loff_t *ppos, - size_t len, unsigned int flags) -{ - ssize_t ret; - - ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf); - if (ret > 0) - *ppos += ret; - - return ret; -} - /** * generic_splice_sendpage - splice data from a pipe to a socket * @pipe: pipe to splice from @@ -855,15 +745,23 @@ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, EXPORT_SYMBOL(generic_splice_sendpage); +static int warn_unsupported(struct file *file, const char *op) +{ + pr_debug_ratelimited( + "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n", + op, file, current->pid, current->comm); + return -EINVAL; +} + /* * Attempt to initiate a splice from pipe to file. */ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - if (out->f_op->splice_write) - return out->f_op->splice_write(pipe, out, ppos, len, flags); - return default_file_splice_write(pipe, out, ppos, len, flags); + if (unlikely(!out->f_op->splice_write)) + return warn_unsupported(out, "write"); + return out->f_op->splice_write(pipe, out, ppos, len, flags); } /* @@ -885,9 +783,9 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(len > MAX_RW_COUNT)) len = MAX_RW_COUNT; - if (in->f_op->splice_read) - return in->f_op->splice_read(in, ppos, pipe, len, flags); - return default_file_splice_read(in, ppos, pipe, len, flags); + if (unlikely(!in->f_op->splice_read)) + return warn_unsupported(in, "read"); + return in->f_op->splice_read(in, ppos, pipe, len, flags); } /** @@ -1107,9 +1005,8 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, /* * Determine where to splice to/from. */ -long do_splice(struct file *in, loff_t __user *off_in, - struct file *out, loff_t __user *off_out, - size_t len, unsigned int flags) +long do_splice(struct file *in, loff_t *off_in, struct file *out, + loff_t *off_out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; @@ -1143,8 +1040,7 @@ long do_splice(struct file *in, loff_t __user *off_in, if (off_out) { if (!(out->f_mode & FMODE_PWRITE)) return -EINVAL; - if (copy_from_user(&offset, off_out, sizeof(loff_t))) - return -EFAULT; + offset = *off_out; } else { offset = out->f_pos; } @@ -1165,8 +1061,8 @@ long do_splice(struct file *in, loff_t __user *off_in, if (!off_out) out->f_pos = offset; - else if (copy_to_user(off_out, &offset, sizeof(loff_t))) - ret = -EFAULT; + else + *off_out = offset; return ret; } @@ -1177,8 +1073,7 @@ long do_splice(struct file *in, loff_t __user *off_in, if (off_in) { if (!(in->f_mode & FMODE_PREAD)) return -EINVAL; - if (copy_from_user(&offset, off_in, sizeof(loff_t))) - return -EFAULT; + offset = *off_in; } else { offset = in->f_pos; } @@ -1202,8 +1097,8 @@ long do_splice(struct file *in, loff_t __user *off_in, wakeup_pipe_readers(opipe); if (!off_in) in->f_pos = offset; - else if (copy_to_user(off_in, &offset, sizeof(loff_t))) - ret = -EFAULT; + else + *off_in = offset; return ret; } @@ -1211,6 +1106,46 @@ long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; } +static long __do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) +{ + struct pipe_inode_info *ipipe; + struct pipe_inode_info *opipe; + loff_t offset, *__off_in = NULL, *__off_out = NULL; + long ret; + + ipipe = get_pipe_info(in, true); + opipe = get_pipe_info(out, true); + + if (ipipe && off_in) + return -ESPIPE; + if (opipe && off_out) + return -ESPIPE; + + if (off_out) { + if (copy_from_user(&offset, off_out, sizeof(loff_t))) + return -EFAULT; + __off_out = &offset; + } + if (off_in) { + if (copy_from_user(&offset, off_in, sizeof(loff_t))) + return -EFAULT; + __off_in = &offset; + } + + ret = do_splice(in, __off_in, out, __off_out, len, flags); + if (ret < 0) + return ret; + + if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t))) + return -EFAULT; + if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t))) + return -EFAULT; + + return ret; +} + static int iter_to_pipe(struct iov_iter *from, struct pipe_inode_info *pipe, unsigned flags) @@ -1405,8 +1340,8 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, if (in.file) { out = fdget(fd_out); if (out.file) { - error = do_splice(in.file, off_in, out.file, off_out, - len, flags); + error = __do_splice(in.file, off_in, out.file, off_out, + len, flags); fdput(out); } fdput(in); diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 0cc4ceec0562..d6c6593ec169 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -380,8 +380,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = msblk->inodes; buf->f_ffree = 0; buf->f_namelen = SQUASHFS_NAME_LEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/stat.c b/fs/stat.c index 44f8ad346db4..dacecdda2e79 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -56,7 +56,7 @@ EXPORT_SYMBOL(generic_fillattr); * @path: file to get attributes from * @stat: structure to return attributes in * @request_mask: STATX_xxx flags indicating what the caller wants - * @query_flags: Query mode (KSTAT_QUERY_FLAGS) + * @query_flags: Query mode (AT_STATX_SYNC_TYPE) * * Get attributes without calling security_inode_getattr. * @@ -71,7 +71,7 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, memset(stat, 0, sizeof(*stat)); stat->result_mask |= STATX_BASIC_STATS; - query_flags &= KSTAT_QUERY_FLAGS; + query_flags &= AT_STATX_SYNC_TYPE; /* allow the fs to override these if it really wants to */ /* SB_NOATIME means filesystem supplies dummy atime value */ @@ -97,7 +97,7 @@ EXPORT_SYMBOL(vfs_getattr_nosec); * @path: The file of interest * @stat: Where to return the statistics * @request_mask: STATX_xxx flags indicating what the caller wants - * @query_flags: Query mode (KSTAT_QUERY_FLAGS) + * @query_flags: Query mode (AT_STATX_SYNC_TYPE) * * Ask the filesystem for a file's attributes. The caller must indicate in * request_mask and query_flags to indicate what they want. @@ -126,53 +126,27 @@ int vfs_getattr(const struct path *path, struct kstat *stat, EXPORT_SYMBOL(vfs_getattr); /** - * vfs_statx_fd - Get the enhanced basic attributes by file descriptor + * vfs_fstat - Get the basic attributes by file descriptor * @fd: The file descriptor referring to the file of interest * @stat: The result structure to fill in. - * @request_mask: STATX_xxx flags indicating what the caller wants - * @query_flags: Query mode (KSTAT_QUERY_FLAGS) * * This function is a wrapper around vfs_getattr(). The main difference is * that it uses a file descriptor to determine the file location. * * 0 will be returned on success, and a -ve error code if unsuccessful. */ -int vfs_statx_fd(unsigned int fd, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int vfs_fstat(int fd, struct kstat *stat) { struct fd f; - int error = -EBADF; - - if (query_flags & ~KSTAT_QUERY_FLAGS) - return -EINVAL; + int error; f = fdget_raw(fd); - if (f.file) { - error = vfs_getattr(&f.file->f_path, stat, - request_mask, query_flags); - fdput(f); - } + if (!f.file) + return -EBADF; + error = vfs_getattr(&f.file->f_path, stat, STATX_BASIC_STATS, 0); + fdput(f); return error; } -EXPORT_SYMBOL(vfs_statx_fd); - -static inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, - int flags) -{ - if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | - AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0) - return -EINVAL; - - *lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; - if (flags & AT_SYMLINK_NOFOLLOW) - *lookup_flags &= ~LOOKUP_FOLLOW; - if (flags & AT_NO_AUTOMOUNT) - *lookup_flags &= ~LOOKUP_AUTOMOUNT; - if (flags & AT_EMPTY_PATH) - *lookup_flags |= LOOKUP_EMPTY; - - return 0; -} /** * vfs_statx - Get basic and extra attributes by filename @@ -189,15 +163,24 @@ static inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, * * 0 will be returned on success, and a -ve error code if unsuccessful. */ -int vfs_statx(int dfd, const char __user *filename, int flags, +static int vfs_statx(int dfd, const char __user *filename, int flags, struct kstat *stat, u32 request_mask) { struct path path; - int error = -EINVAL; - unsigned lookup_flags; + unsigned lookup_flags = 0; + int error; - if (vfs_stat_set_lookup_flags(&lookup_flags, flags)) + if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH | + AT_STATX_SYNC_TYPE)) return -EINVAL; + + if (!(flags & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + if (!(flags & AT_NO_AUTOMOUNT)) + lookup_flags |= LOOKUP_AUTOMOUNT; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) @@ -217,8 +200,13 @@ retry: out: return error; } -EXPORT_SYMBOL(vfs_statx); +int vfs_fstatat(int dfd, const char __user *filename, + struct kstat *stat, int flags) +{ + return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT, + stat, STATX_BASIC_STATS); +} #ifdef __ARCH_WANT_OLD_STAT diff --git a/fs/statfs.c b/fs/statfs.c index 2616424012ea..59f33752c131 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -29,6 +29,8 @@ static int flags_by_mnt(int mnt_flags) flags |= ST_NODIRATIME; if (mnt_flags & MNT_RELATIME) flags |= ST_RELATIME; + if (mnt_flags & MNT_NOSYMFOLLOW) + flags |= ST_NOSYMFOLLOW; return flags; } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 02b1d9d0c182..be47263b8605 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -98,8 +98,7 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = sbi->s_ninodes; buf->f_ffree = sysv_count_free_inodes(sb); buf->f_namelen = SYSV_NAMELEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/udf/super.c b/fs/udf/super.c index faf2017ada11..5bef3a68395d 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -2414,8 +2414,7 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) + buf->f_bfree; buf->f_ffree = buf->f_bfree; buf->f_namelen = UDF_NAME_LEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index e3b69fb280e8..983558b572c7 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1431,8 +1431,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) ? (buf->f_bfree - uspi->s_root_blocks) : 0; buf->f_files = uspi->s_ncg * uspi->s_ipg; buf->f_namelen = UFS_MAXNAMLEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); mutex_unlock(&UFS_SB(sb)->s_lock); diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index e685299eb3d2..9fac5ea8d0e4 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -22,6 +22,31 @@ config XFS_FS system of your root partition is compiled as a module, you'll need to use an initial ramdisk (initrd) to boot. +config XFS_SUPPORT_V4 + bool "Support deprecated V4 (crc=0) format" + depends on XFS_FS + default y + help + The V4 filesystem format lacks certain features that are supported + by the V5 format, such as metadata checksumming, strengthened + metadata verification, and the ability to store timestamps past the + year 2038. Because of this, the V4 format is deprecated. All users + should upgrade by backing up their files, reformatting, and restoring + from the backup. + + Administrators and users can detect a V4 filesystem by running + xfs_info against a filesystem mountpoint and checking for a string + beginning with "crc=". If the string "crc=0" is found, the + filesystem is a V4 filesystem. If no such string is found, please + upgrade xfsprogs to the latest version and try again. + + This option will become default N in September 2025. Support for the + V4 format will be removed entirely in September 2030. Distributors + can say N here to withdraw support earlier. + + To continue supporting the old V4 format (crc=0), say Y. + To close off an attack surface, say N. + config XFS_QUOTA bool "XFS Quota support" depends on XFS_FS diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 3f80cede7406..48d8e9caf86f 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -96,8 +96,6 @@ xfs_attr3_rmt_verify( { struct xfs_attr3_rmt_hdr *rmt = ptr; - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return __this_address; if (!xfs_verify_magic(bp, rmt->rm_magic)) return __this_address; if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid)) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 1b0a01b06a05..d9a692484eae 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5046,20 +5046,25 @@ xfs_bmap_del_extent_real( flags = XFS_ILOG_CORE; if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { - xfs_fsblock_t bno; xfs_filblks_t len; xfs_extlen_t mod; - bno = div_u64_rem(del->br_startblock, mp->m_sb.sb_rextsize, - &mod); - ASSERT(mod == 0); len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize, &mod); ASSERT(mod == 0); - error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); - if (error) - goto done; + if (!(bflags & XFS_BMAPI_REMAP)) { + xfs_fsblock_t bno; + + bno = div_u64_rem(del->br_startblock, + mp->m_sb.sb_rextsize, &mod); + ASSERT(mod == 0); + + error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + if (error) + goto done; + } + do_fx = 0; nblks = len * mp->m_sb.sb_rextsize; qfield = XFS_TRANS_DQ_RTBCOUNT; diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index b40a4e80f5ee..b876b44c0204 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -15,8 +15,8 @@ */ #define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ #define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */ -#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ -#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ +#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ typedef struct xfs_da_blkinfo { __be32 forw; /* previous block in list */ @@ -35,8 +35,8 @@ typedef struct xfs_da_blkinfo { */ #define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */ #define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */ -#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */ -#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */ +#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v3 dirlf single blks */ +#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v3 dirlf multi blks */ struct xfs_da3_blkinfo { /* @@ -61,7 +61,7 @@ struct xfs_da3_blkinfo { * Since we have duplicate keys, use a binary search but always follow * all match in the block, not just the first match found. */ -#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ +#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ typedef struct xfs_da_node_hdr { struct xfs_da_blkinfo info; /* block type, links, etc. */ @@ -746,14 +746,14 @@ xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) */ static inline int xfs_attr_leaf_entsize_remote(int nlen) { - return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \ - XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); + return round_up(sizeof(struct xfs_attr_leaf_name_remote) - 1 + + nlen, XFS_ATTR_LEAF_NAME_ALIGN); } static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) { - return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + - XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); + return round_up(sizeof(struct xfs_attr_leaf_name_local) - 1 + + nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN); } static inline int xfs_attr_leaf_entsize_local_max(int bsize) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index d8f586256add..eff4a127188e 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -16,6 +16,8 @@ #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_log.h" /* * Deferred Operations in XFS @@ -186,8 +188,9 @@ xfs_defer_create_intent( { const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; - dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, - dfp->dfp_count, sort); + if (!dfp->dfp_intent) + dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, + dfp->dfp_count, sort); } /* @@ -312,22 +315,6 @@ xfs_defer_trans_roll( } /* - * Reset an already used dfops after finish. - */ -static void -xfs_defer_reset( - struct xfs_trans *tp) -{ - ASSERT(list_empty(&tp->t_dfops)); - - /* - * Low mode state transfers across transaction rolls to mirror dfops - * lifetime. Clear it now that dfops is reset. - */ - tp->t_flags &= ~XFS_TRANS_LOWMODE; -} - -/* * Free up any items left in the list. */ static void @@ -360,6 +347,58 @@ xfs_defer_cancel_list( } /* + * Prevent a log intent item from pinning the tail of the log by logging a + * done item to release the intent item; and then log a new intent item. + * The caller should provide a fresh transaction and roll it after we're done. + */ +static int +xfs_defer_relog( + struct xfs_trans **tpp, + struct list_head *dfops) +{ + struct xlog *log = (*tpp)->t_mountp->m_log; + struct xfs_defer_pending *dfp; + xfs_lsn_t threshold_lsn = NULLCOMMITLSN; + + + ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); + + list_for_each_entry(dfp, dfops, dfp_list) { + /* + * If the log intent item for this deferred op is not a part of + * the current log checkpoint, relog the intent item to keep + * the log tail moving forward. We're ok with this being racy + * because an incorrect decision means we'll be a little slower + * at pushing the tail. + */ + if (dfp->dfp_intent == NULL || + xfs_log_item_in_current_chkpt(dfp->dfp_intent)) + continue; + + /* + * Figure out where we need the tail to be in order to maintain + * the minimum required free space in the log. Only sample + * the log threshold once per call. + */ + if (threshold_lsn == NULLCOMMITLSN) { + threshold_lsn = xlog_grant_push_threshold(log, 0); + if (threshold_lsn == NULLCOMMITLSN) + break; + } + if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0) + continue; + + trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); + XFS_STATS_INC((*tpp)->t_mountp, defer_relog); + dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp); + } + + if ((*tpp)->t_flags & XFS_TRANS_DIRTY) + return xfs_defer_trans_roll(tpp); + return 0; +} + +/* * Log an intent-done item for the first pending intent, and finish the work * items. */ @@ -390,6 +429,7 @@ xfs_defer_finish_one( list_add(li, &dfp->dfp_work); dfp->dfp_count++; dfp->dfp_done = NULL; + dfp->dfp_intent = NULL; xfs_defer_create_intent(tp, dfp, false); } @@ -428,13 +468,27 @@ xfs_defer_finish_noroll( /* Until we run out of pending work to finish... */ while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) { + /* + * Deferred items that are created in the process of finishing + * other deferred work items should be queued at the head of + * the pending list, which puts them ahead of the deferred work + * that was created by the caller. This keeps the number of + * pending work items to a minimum, which decreases the amount + * of time that any one intent item can stick around in memory, + * pinning the log tail. + */ xfs_defer_create_intents(*tp); - list_splice_tail_init(&(*tp)->t_dfops, &dop_pending); + list_splice_init(&(*tp)->t_dfops, &dop_pending); error = xfs_defer_trans_roll(tp); if (error) goto out_shutdown; + /* Possibly relog intent items to keep the log moving. */ + error = xfs_defer_relog(tp, &dop_pending); + if (error) + goto out_shutdown; + dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, dfp_list); error = xfs_defer_finish_one(*tp, dfp); @@ -475,7 +529,10 @@ xfs_defer_finish( return error; } } - xfs_defer_reset(*tp); + + /* Reset LOWMODE now that we've finished all the dfops. */ + ASSERT(list_empty(&(*tp)->t_dfops)); + (*tp)->t_flags &= ~XFS_TRANS_LOWMODE; return 0; } @@ -549,6 +606,139 @@ xfs_defer_move( * that behavior. */ dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE); + stp->t_flags &= ~XFS_TRANS_LOWMODE; +} + +/* + * Prepare a chain of fresh deferred ops work items to be completed later. Log + * recovery requires the ability to put off until later the actual finishing + * work so that it can process unfinished items recovered from the log in + * correct order. + * + * Create and log intent items for all the work that we're capturing so that we + * can be assured that the items will get replayed if the system goes down + * before log recovery gets a chance to finish the work it put off. The entire + * deferred ops state is transferred to the capture structure and the + * transaction is then ready for the caller to commit it. If there are no + * intent items to capture, this function returns NULL. + * + * If capture_ip is not NULL, the capture structure will obtain an extra + * reference to the inode. + */ +static struct xfs_defer_capture * +xfs_defer_ops_capture( + struct xfs_trans *tp, + struct xfs_inode *capture_ip) +{ + struct xfs_defer_capture *dfc; + + if (list_empty(&tp->t_dfops)) + return NULL; + + /* Create an object to capture the defer ops. */ + dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS); + INIT_LIST_HEAD(&dfc->dfc_list); + INIT_LIST_HEAD(&dfc->dfc_dfops); + + xfs_defer_create_intents(tp); + + /* Move the dfops chain and transaction state to the capture struct. */ + list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); + dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; + tp->t_flags &= ~XFS_TRANS_LOWMODE; + + /* Capture the remaining block reservations along with the dfops. */ + dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used; + dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used; + + /* Preserve the log reservation size. */ + dfc->dfc_logres = tp->t_log_res; + + /* + * Grab an extra reference to this inode and attach it to the capture + * structure. + */ + if (capture_ip) { + ihold(VFS_I(capture_ip)); + dfc->dfc_capture_ip = capture_ip; + } + + return dfc; +} + +/* Release all resources that we used to capture deferred ops. */ +void +xfs_defer_ops_release( + struct xfs_mount *mp, + struct xfs_defer_capture *dfc) +{ + xfs_defer_cancel_list(mp, &dfc->dfc_dfops); + if (dfc->dfc_capture_ip) + xfs_irele(dfc->dfc_capture_ip); + kmem_free(dfc); +} + +/* + * Capture any deferred ops and commit the transaction. This is the last step + * needed to finish a log intent item that we recovered from the log. If any + * of the deferred ops operate on an inode, the caller must pass in that inode + * so that the reference can be transferred to the capture structure. The + * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling + * xfs_defer_ops_continue. + */ +int +xfs_defer_ops_capture_and_commit( + struct xfs_trans *tp, + struct xfs_inode *capture_ip, + struct list_head *capture_list) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_defer_capture *dfc; + int error; + + ASSERT(!capture_ip || xfs_isilocked(capture_ip, XFS_ILOCK_EXCL)); + + /* If we don't capture anything, commit transaction and exit. */ + dfc = xfs_defer_ops_capture(tp, capture_ip); + if (!dfc) + return xfs_trans_commit(tp); + + /* Commit the transaction and add the capture structure to the list. */ + error = xfs_trans_commit(tp); + if (error) { + xfs_defer_ops_release(mp, dfc); + return error; + } + + list_add_tail(&dfc->dfc_list, capture_list); + return 0; +} + +/* + * Attach a chain of captured deferred ops to a new transaction and free the + * capture structure. If an inode was captured, it will be passed back to the + * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0. + * The caller now owns the inode reference. + */ +void +xfs_defer_ops_continue( + struct xfs_defer_capture *dfc, + struct xfs_trans *tp, + struct xfs_inode **captured_ipp) +{ + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); + + /* Lock and join the captured inode to the new transaction. */ + if (dfc->dfc_capture_ip) { + xfs_ilock(dfc->dfc_capture_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dfc->dfc_capture_ip, 0); + } + *captured_ipp = dfc->dfc_capture_ip; + + /* Move captured dfops chain and state to the transaction. */ + list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); + tp->t_flags |= dfc->dfc_tpflags; - xfs_defer_reset(stp); + kmem_free(dfc); } diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 6b2ca580f2b0..05472f71fffe 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -8,6 +8,7 @@ struct xfs_btree_cur; struct xfs_defer_op_type; +struct xfs_defer_capture; /* * Header for deferred operation list. @@ -63,4 +64,40 @@ extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; +/* + * This structure enables a dfops user to detach the chain of deferred + * operations from a transaction so that they can be continued later. + */ +struct xfs_defer_capture { + /* List of other capture structures. */ + struct list_head dfc_list; + + /* Deferred ops state saved from the transaction. */ + struct list_head dfc_dfops; + unsigned int dfc_tpflags; + + /* Block reservations for the data and rt devices. */ + unsigned int dfc_blkres; + unsigned int dfc_rtxres; + + /* Log reservation saved from the transaction. */ + unsigned int dfc_logres; + + /* + * An inode reference that must be maintained to complete the deferred + * work. + */ + struct xfs_inode *dfc_capture_ip; +}; + +/* + * Functions to capture a chain of deferred operations and continue them later. + * This doesn't normally happen except log recovery. + */ +int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp, + struct xfs_inode *capture_ip, struct list_head *capture_list); +void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp, + struct xfs_inode **captured_ipp); +void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d); + #endif /* __XFS_DEFER_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 536666143fe7..ef5eaf33d146 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -17,7 +17,7 @@ struct xfs_dinode; */ struct xfs_icdinode { uint16_t di_flushiter; /* incremented on flush */ - uint32_t di_projid; /* owner's project id */ + prid_t di_projid; /* owner's project id */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 27c39268c31f..340c83f76c80 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2505,12 +2505,15 @@ xfs_rmap_map_extent( int whichfork, struct xfs_bmbt_irec *PREV) { + enum xfs_rmap_intent_type type = XFS_RMAP_MAP; + if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) return; - __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? - XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino, - whichfork, PREV); + if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) + type = XFS_RMAP_MAP_SHARED; + + __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); } /* Unmap an extent out of a file. */ @@ -2521,12 +2524,15 @@ xfs_rmap_unmap_extent( int whichfork, struct xfs_bmbt_irec *PREV) { + enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP; + if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) return; - __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? - XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino, - whichfork, PREV); + if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) + type = XFS_RMAP_UNMAP_SHARED; + + __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); } /* @@ -2543,12 +2549,15 @@ xfs_rmap_convert_extent( int whichfork, struct xfs_bmbt_irec *PREV) { + enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT; + if (!xfs_rmap_update_is_needed(mp, whichfork)) return; - __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? - XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino, - whichfork, PREV); + if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) + type = XFS_RMAP_CONVERT_SHARED; + + __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); } /* Schedule the creation of an rmap for non-file data. */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 1d9fa8a300f1..6c1aba16113c 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1018,7 +1018,6 @@ xfs_rtalloc_query_range( struct xfs_mount *mp = tp->t_mountp; xfs_rtblock_t rtstart; xfs_rtblock_t rtend; - xfs_rtblock_t rem; int is_free; int error = 0; @@ -1027,13 +1026,12 @@ xfs_rtalloc_query_range( if (low_rec->ar_startext >= mp->m_sb.sb_rextents || low_rec->ar_startext == high_rec->ar_startext) return 0; - if (high_rec->ar_startext > mp->m_sb.sb_rextents) - high_rec->ar_startext = mp->m_sb.sb_rextents; + high_rec->ar_startext = min(high_rec->ar_startext, + mp->m_sb.sb_rextents - 1); /* Iterate the bitmap, looking for discrepancies. */ rtstart = low_rec->ar_startext; - rem = high_rec->ar_startext - rtstart; - while (rem) { + while (rtstart <= high_rec->ar_startext) { /* Is the first block free? */ error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend, &is_free); @@ -1042,7 +1040,7 @@ xfs_rtalloc_query_range( /* How long does the extent go for? */ error = xfs_rtfind_forw(mp, tp, rtstart, - high_rec->ar_startext - 1, &rtend); + high_rec->ar_startext, &rtend); if (error) break; @@ -1055,7 +1053,6 @@ xfs_rtalloc_query_range( break; } - rem -= rtend - rtstart + 1; rtstart = rtend + 1; } diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index e56786f0a13c..653f3280e1c1 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -441,6 +441,20 @@ xchk_da_btree_block( goto out_freebp; } + /* + * If we've been handed a block that is below the dabtree root, does + * its hashval match what the parent block expected to see? + */ + if (level > 0) { + struct xfs_da_node_entry *key; + + key = xchk_da_btree_node_entry(ds, level - 1); + if (be32_to_cpu(key->hashval) != blk->hashval) { + xchk_da_set_corrupt(ds, level); + goto out_freebp; + } + } + out: return error; out_freebp: diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index ec3691372e7c..9e16a4d0f97c 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -24,6 +24,7 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_quota.h" kmem_zone_t *xfs_bui_zone; kmem_zone_t *xfs_bud_zone; @@ -423,30 +424,26 @@ const struct xfs_defer_op_type xfs_bmap_update_defer_type = { STATIC int xfs_bui_item_recover( struct xfs_log_item *lip, - struct xfs_trans *parent_tp) + struct list_head *capture_list) { struct xfs_bmbt_irec irec; struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_mount *mp = lip->li_mountp; struct xfs_map_extent *bmap; struct xfs_bud_log_item *budp; xfs_fsblock_t startblock_fsb; xfs_fsblock_t inode_fsb; xfs_filblks_t count; xfs_exntst_t state; - enum xfs_bmap_intent_type type; - bool op_ok; unsigned int bui_type; int whichfork; int error = 0; /* Only one mapping operation per BUI... */ - if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { - xfs_bui_release(buip); + if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) return -EFSCORRUPTED; - } /* * First check the validity of the extent described by the @@ -457,76 +454,58 @@ xfs_bui_item_recover( XFS_FSB_TO_DADDR(mp, bmap->me_startblock)); inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp, XFS_INO_TO_FSB(mp, bmap->me_owner))); - switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { + state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + switch (bui_type) { case XFS_BMAP_MAP: case XFS_BMAP_UNMAP: - op_ok = true; break; default: - op_ok = false; - break; + return -EFSCORRUPTED; } - if (!op_ok || startblock_fsb == 0 || + if (startblock_fsb == 0 || bmap->me_len == 0 || inode_fsb == 0 || startblock_fsb >= mp->m_sb.sb_dblocks || bmap->me_len >= mp->m_sb.sb_agblocks || inode_fsb >= mp->m_sb.sb_dblocks || - (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) { - /* - * This will pull the BUI from the AIL and - * free the memory associated with it. - */ - xfs_bui_release(buip); + (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) return -EFSCORRUPTED; - } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, - XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); + /* Grab the inode. */ + error = xfs_iget(mp, NULL, bmap->me_owner, 0, 0, &ip); if (error) return error; - /* - * Recovery stashes all deferred ops during intent processing and - * finishes them on completion. Transfer current dfops state to this - * transaction and transfer the result back before we return. - */ - xfs_defer_move(tp, parent_tp); - budp = xfs_trans_get_bud(tp, buip); - /* Grab the inode. */ - error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip); + error = xfs_qm_dqattach(ip); if (error) - goto err_inode; + goto err_rele; if (VFS_I(ip)->i_nlink == 0) xfs_iflags_set(ip, XFS_IRECOVERY); - /* Process deferred bmap item. */ - state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; - switch (bui_type) { - case XFS_BMAP_MAP: - case XFS_BMAP_UNMAP: - type = bui_type; - break; - default: - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); - error = -EFSCORRUPTED; - goto err_inode; - } + /* Allocate transaction and do the work. */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); + if (error) + goto err_rele; + + budp = xfs_trans_get_bud(tp, buip); + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); count = bmap->me_len; - error = xfs_trans_log_finish_bmap_update(tp, budp, type, ip, whichfork, - bmap->me_startoff, bmap->me_startblock, &count, state); + error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, + whichfork, bmap->me_startoff, bmap->me_startblock, + &count, state); if (error) - goto err_inode; + goto err_cancel; if (count > 0) { - ASSERT(type == XFS_BMAP_UNMAP); + ASSERT(bui_type == XFS_BMAP_UNMAP); irec.br_startblock = bmap->me_startblock; irec.br_blockcount = count; irec.br_startoff = bmap->me_startoff; @@ -534,20 +513,24 @@ xfs_bui_item_recover( xfs_bmap_unmap_extent(tp, ip, &irec); } - xfs_defer_move(parent_tp, tp); - error = xfs_trans_commit(tp); + /* + * Commit transaction, which frees the transaction and saves the inode + * for later replay activities. + */ + error = xfs_defer_ops_capture_and_commit(tp, ip, capture_list); + if (error) + goto err_unlock; + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); + return 0; - return error; - -err_inode: - xfs_defer_move(parent_tp, tp); +err_cancel: xfs_trans_cancel(tp); - if (ip) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_irele(ip); - } +err_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); +err_rele: + xfs_irele(ip); return error; } @@ -559,6 +542,32 @@ xfs_bui_item_match( return BUI_ITEM(lip)->bui_format.bui_id == intent_id; } +/* Relog an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_bui_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_bud_log_item *budp; + struct xfs_bui_log_item *buip; + struct xfs_map_extent *extp; + unsigned int count; + + count = BUI_ITEM(intent)->bui_format.bui_nextents; + extp = BUI_ITEM(intent)->bui_format.bui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); + + buip = xfs_bui_init(tp->t_mountp); + memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp)); + atomic_set(&buip->bui_next_extent, count); + xfs_trans_add_item(tp, &buip->bui_item); + set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); + return &buip->bui_item; +} + static const struct xfs_item_ops xfs_bui_item_ops = { .iop_size = xfs_bui_item_size, .iop_format = xfs_bui_item_format, @@ -566,6 +575,7 @@ static const struct xfs_item_ops xfs_bui_item_ops = { .iop_release = xfs_bui_item_release, .iop_recover = xfs_bui_item_recover, .iop_match = xfs_bui_item_match, + .iop_relog = xfs_bui_item_relog, }; /* diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index f2a8a0e75e1f..7371a7f7c652 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -947,11 +947,11 @@ xfs_free_file_space( endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); /* We can only free complete realtime extents. */ - if (XFS_IS_REALTIME_INODE(ip)) { - xfs_extlen_t extsz = xfs_get_extsz_hint(ip); - - if ((startoffset_fsb | endoffset_fsb) & (extsz - 1)) - return -EINVAL; + if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { + startoffset_fsb = roundup_64(startoffset_fsb, + mp->m_sb.sb_rextsize); + endoffset_fsb = rounddown_64(endoffset_fsb, + mp->m_sb.sb_rextsize); } /* @@ -1147,14 +1147,6 @@ xfs_insert_file_space( trace_xfs_insert_file_space(ip); - /* We can only insert complete realtime extents. */ - if (XFS_IS_REALTIME_INODE(ip)) { - xfs_extlen_t extsz = xfs_get_extsz_hint(ip); - - if ((stop_fsb | shift_fsb) & (extsz - 1)) - return -EINVAL; - } - error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb); if (error) return error; diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 24c7a8d11e1a..d44e8b4a3391 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -719,6 +719,8 @@ xlog_recover_get_buf_lsn( case XFS_ABTC_MAGIC: case XFS_RMAP_CRC_MAGIC: case XFS_REFC_CRC_MAGIC: + case XFS_FIBT_CRC_MAGIC: + case XFS_FIBT_MAGIC: case XFS_IBT_CRC_MAGIC: case XFS_IBT_MAGIC: { struct xfs_btree_block *btb = blk; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 3072814e407d..1d95ed387d66 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -831,8 +831,8 @@ xfs_qm_dqget_checks( } /* - * Given the file system, id, and type (UDQUOT/GDQUOT), return a locked - * dquot, doing an allocation (if requested) as needed. + * Given the file system, id, and type (UDQUOT/GDQUOT/PDQUOT), return a + * locked dquot, doing an allocation (if requested) as needed. */ int xfs_qm_dqget( diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6cb8cd11072a..6c11bfc3d452 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -585,10 +585,10 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = { STATIC int xfs_efi_item_recover( struct xfs_log_item *lip, - struct xfs_trans *parent_tp) + struct list_head *capture_list) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_mount *mp = lip->li_mountp; struct xfs_efd_log_item *efdp; struct xfs_trans *tp; struct xfs_extent *extp; @@ -608,14 +608,8 @@ xfs_efi_item_recover( if (startblock_fsb == 0 || extp->ext_len == 0 || startblock_fsb >= mp->m_sb.sb_dblocks || - extp->ext_len >= mp->m_sb.sb_agblocks) { - /* - * This will pull the EFI from the AIL and - * free the memory associated with it. - */ - xfs_efi_release(efip); + extp->ext_len >= mp->m_sb.sb_agblocks) return -EFSCORRUPTED; - } } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); @@ -633,8 +627,7 @@ xfs_efi_item_recover( } - error = xfs_trans_commit(tp); - return error; + return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); abort_error: xfs_trans_cancel(tp); @@ -649,6 +642,34 @@ xfs_efi_item_match( return EFI_ITEM(lip)->efi_format.efi_id == intent_id; } +/* Relog an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_efi_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_efd_log_item *efdp; + struct xfs_efi_log_item *efip; + struct xfs_extent *extp; + unsigned int count; + + count = EFI_ITEM(intent)->efi_format.efi_nextents; + extp = EFI_ITEM(intent)->efi_format.efi_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count); + efdp->efd_next_extent = count; + memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp)); + set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); + + efip = xfs_efi_init(tp->t_mountp, count); + memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); + atomic_set(&efip->efi_next_extent, count); + xfs_trans_add_item(tp, &efip->efi_item); + set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); + return &efip->efi_item; +} + static const struct xfs_item_ops xfs_efi_item_ops = { .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, @@ -656,6 +677,7 @@ static const struct xfs_item_ops xfs_efi_item_ops = { .iop_release = xfs_efi_item_release, .iop_recover = xfs_efi_item_recover, .iop_match = xfs_efi_item_match, + .iop_relog = xfs_efi_item_relog, }; /* diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 3d1b95124744..5b0f93f73837 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -32,6 +32,39 @@ static const struct vm_operations_struct xfs_file_vm_ops; +/* + * Decide if the given file range is aligned to the size of the fundamental + * allocation unit for the file. + */ +static bool +xfs_is_falloc_aligned( + struct xfs_inode *ip, + loff_t pos, + long long int len) +{ + struct xfs_mount *mp = ip->i_mount; + uint64_t mask; + + if (XFS_IS_REALTIME_INODE(ip)) { + if (!is_power_of_2(mp->m_sb.sb_rextsize)) { + u64 rextbytes; + u32 mod; + + rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); + div_u64_rem(pos, rextbytes, &mod); + if (mod) + return false; + div_u64_rem(len, rextbytes, &mod); + return mod == 0; + } + mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1; + } else { + mask = mp->m_sb.sb_blocksize - 1; + } + + return !((pos | len) & mask); +} + int xfs_update_prealloc_flags( struct xfs_inode *ip, @@ -850,9 +883,7 @@ xfs_file_fallocate( if (error) goto out_unlock; } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { - unsigned int blksize_mask = i_blocksize(inode) - 1; - - if (offset & blksize_mask || len & blksize_mask) { + if (!xfs_is_falloc_aligned(ip, offset, len)) { error = -EINVAL; goto out_unlock; } @@ -872,10 +903,9 @@ xfs_file_fallocate( if (error) goto out_unlock; } else if (mode & FALLOC_FL_INSERT_RANGE) { - unsigned int blksize_mask = i_blocksize(inode) - 1; loff_t isize = i_size_read(inode); - if (offset & blksize_mask || len & blksize_mask) { + if (!xfs_is_falloc_aligned(ip, offset, len)) { error = -EINVAL; goto out_unlock; } diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 1a88025e68a3..db23e455eb91 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -33,39 +33,7 @@ enum xfs_fstrm_alloc { /* * Allocation group filestream associations are tracked with per-ag atomic * counters. These counters allow xfs_filestream_pick_ag() to tell whether a - * particular AG already has active filestreams associated with it. The mount - * point's m_peraglock is used to protect these counters from per-ag array - * re-allocation during a growfs operation. When xfs_growfs_data_private() is - * about to reallocate the array, it calls xfs_filestream_flush() with the - * m_peraglock held in write mode. - * - * Since xfs_mru_cache_flush() guarantees that all the free functions for all - * the cache elements have finished executing before it returns, it's safe for - * the free functions to use the atomic counters without m_peraglock protection. - * This allows the implementation of xfs_fstrm_free_func() to be agnostic about - * whether it was called with the m_peraglock held in read mode, write mode or - * not held at all. The race condition this addresses is the following: - * - * - The work queue scheduler fires and pulls a filestream directory cache - * element off the LRU end of the cache for deletion, then gets pre-empted. - * - A growfs operation grabs the m_peraglock in write mode, flushes all the - * remaining items from the cache and reallocates the mount point's per-ag - * array, resetting all the counters to zero. - * - The work queue thread resumes and calls the free function for the element - * it started cleaning up earlier. In the process it decrements the - * filestreams counter for an AG that now has no references. - * - * With a shrinkfs feature, the above scenario could panic the system. - * - * All other uses of the following macros should be protected by either the - * m_peraglock held in read mode, or the cache's internal locking exposed by the - * interval between a call to xfs_mru_cache_lookup() and a call to - * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode - * when new elements are added to the cache. - * - * Combined, these locking rules ensure that no associations will ever exist in - * the cache that reference per-ag array elements that have since been - * reallocated. + * particular AG already has active filestreams associated with it. */ int xfs_filestream_peek_ag( diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 4eebcec4aae6..9ce5e7d5bf8f 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -26,7 +26,7 @@ #include "xfs_rtalloc.h" /* Convert an xfs_fsmap to an fsmap. */ -void +static void xfs_fsmap_from_internal( struct fsmap *dest, struct xfs_fsmap *src) @@ -155,8 +155,7 @@ xfs_fsmap_owner_from_rmap( /* getfsmap query state */ struct xfs_getfsmap_info { struct xfs_fsmap_head *head; - xfs_fsmap_format_t formatter; /* formatting fn */ - void *format_arg; /* format buffer */ + struct fsmap *fsmap_recs; /* mapping records */ struct xfs_buf *agf_bp; /* AGF, for refcount queries */ xfs_daddr_t next_daddr; /* next daddr we expect */ u64 missing_owner; /* owner of holes */ @@ -224,6 +223,20 @@ xfs_getfsmap_is_shared( return 0; } +static inline void +xfs_getfsmap_format( + struct xfs_mount *mp, + struct xfs_fsmap *xfm, + struct xfs_getfsmap_info *info) +{ + struct fsmap *rec; + + trace_xfs_getfsmap_mapping(mp, xfm); + + rec = &info->fsmap_recs[info->head->fmh_entries++]; + xfs_fsmap_from_internal(rec, xfm); +} + /* * Format a reverse mapping for getfsmap, having translated rm_startblock * into the appropriate daddr units. @@ -256,6 +269,9 @@ xfs_getfsmap_helper( /* Are we just counting mappings? */ if (info->head->fmh_count == 0) { + if (info->head->fmh_entries == UINT_MAX) + return -ECANCELED; + if (rec_daddr > info->next_daddr) info->head->fmh_entries++; @@ -285,10 +301,7 @@ xfs_getfsmap_helper( fmr.fmr_offset = 0; fmr.fmr_length = rec_daddr - info->next_daddr; fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; - error = info->formatter(&fmr, info->format_arg); - if (error) - return error; - info->head->fmh_entries++; + xfs_getfsmap_format(mp, &fmr, info); } if (info->last) @@ -320,11 +333,8 @@ xfs_getfsmap_helper( if (shared) fmr.fmr_flags |= FMR_OF_SHARED; } - error = info->formatter(&fmr, info->format_arg); - if (error) - return error; - info->head->fmh_entries++; + xfs_getfsmap_format(mp, &fmr, info); out: rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); if (info->next_daddr < rec_daddr) @@ -792,11 +802,11 @@ xfs_getfsmap_check_keys( #endif /* CONFIG_XFS_RT */ /* - * Get filesystem's extents as described in head, and format for - * output. Calls formatter to fill the user's buffer until all - * extents are mapped, until the passed-in head->fmh_count slots have - * been filled, or until the formatter short-circuits the loop, if it - * is tracking filled-in extents on its own. + * Get filesystem's extents as described in head, and format for output. Fills + * in the supplied records array until there are no more reverse mappings to + * return or head.fmh_entries == head.fmh_count. In the second case, this + * function returns -ECANCELED to indicate that more records would have been + * returned. * * Key to Confusion * ---------------- @@ -816,8 +826,7 @@ int xfs_getfsmap( struct xfs_mount *mp, struct xfs_fsmap_head *head, - xfs_fsmap_format_t formatter, - void *arg) + struct fsmap *fsmap_recs) { struct xfs_trans *tp = NULL; struct xfs_fsmap dkeys[2]; /* per-dev keys */ @@ -892,8 +901,7 @@ xfs_getfsmap( info.next_daddr = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; - info.formatter = formatter; - info.format_arg = arg; + info.fsmap_recs = fsmap_recs; info.head = head; /* diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h index c6c57739b862..a0775788e7b1 100644 --- a/fs/xfs/xfs_fsmap.h +++ b/fs/xfs/xfs_fsmap.h @@ -27,13 +27,9 @@ struct xfs_fsmap_head { struct xfs_fsmap fmh_keys[2]; /* low and high keys */ }; -void xfs_fsmap_from_internal(struct fsmap *dest, struct xfs_fsmap *src); void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src); -/* fsmap to userspace formatter - copy to user & advance pointer */ -typedef int (*xfs_fsmap_format_t)(struct xfs_fsmap *, void *); - int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head, - xfs_fsmap_format_t formatter, void *arg); + struct fsmap *out_recs); #endif /* __XFS_FSMAP_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 49624973eecc..2bfbcf28b1bd 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -698,6 +698,68 @@ out_unlock: return error; } +/* Propagate di_flags from a parent inode to a child inode. */ +static void +xfs_inode_inherit_flags( + struct xfs_inode *ip, + const struct xfs_inode *pip) +{ + unsigned int di_flags = 0; + umode_t mode = VFS_I(ip)->i_mode; + + if (S_ISDIR(mode)) { + if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + } else if (S_ISREG(mode)) { + if ((pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) && + xfs_sb_version_hasrealtime(&ip->i_mount->m_sb)) + di_flags |= XFS_DIFLAG_REALTIME; + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSIZE; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + } + if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && + xfs_inherit_noatime) + di_flags |= XFS_DIFLAG_NOATIME; + if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && + xfs_inherit_nodump) + di_flags |= XFS_DIFLAG_NODUMP; + if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && + xfs_inherit_sync) + di_flags |= XFS_DIFLAG_SYNC; + if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && + xfs_inherit_nosymlinks) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && + xfs_inherit_nodefrag) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + + ip->i_d.di_flags |= di_flags; +} + +/* Propagate di_flags2 from a parent inode to a child inode. */ +static void +xfs_inode_inherit_flags2( + struct xfs_inode *ip, + const struct xfs_inode *pip) +{ + if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { + ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; + } + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; +} + /* * Allocate an inode on disk and return a copy of its in-core version. * The in-core inode is locked exclusively. Set mode, nlink, and rdev @@ -841,54 +903,10 @@ xfs_ialloc( break; case S_IFREG: case S_IFDIR: - if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { - uint di_flags = 0; - - if (S_ISDIR(mode)) { - if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) - di_flags |= XFS_DIFLAG_RTINHERIT; - if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { - di_flags |= XFS_DIFLAG_EXTSZINHERIT; - ip->i_d.di_extsize = pip->i_d.di_extsize; - } - if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - di_flags |= XFS_DIFLAG_PROJINHERIT; - } else if (S_ISREG(mode)) { - if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) - di_flags |= XFS_DIFLAG_REALTIME; - if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { - di_flags |= XFS_DIFLAG_EXTSIZE; - ip->i_d.di_extsize = pip->i_d.di_extsize; - } - } - if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && - xfs_inherit_noatime) - di_flags |= XFS_DIFLAG_NOATIME; - if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && - xfs_inherit_nodump) - di_flags |= XFS_DIFLAG_NODUMP; - if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && - xfs_inherit_sync) - di_flags |= XFS_DIFLAG_SYNC; - if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && - xfs_inherit_nosymlinks) - di_flags |= XFS_DIFLAG_NOSYMLINKS; - if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && - xfs_inherit_nodefrag) - di_flags |= XFS_DIFLAG_NODEFRAG; - if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) - di_flags |= XFS_DIFLAG_FILESTREAM; - - ip->i_d.di_flags |= di_flags; - } - if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) { - if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { - ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; - ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; - } - if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) - ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; - } + if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) + xfs_inode_inherit_flags(ip, pip); + if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) + xfs_inode_inherit_flags2(ip, pip); /* FALLTHROUGH */ case S_IFLNK: ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; @@ -1516,17 +1534,10 @@ xfs_itruncate_extents_flags( if (error) goto out; - /* - * Duplicate the transaction that has the permanent - * reservation and commit the old transaction. - */ + /* free the just unmapped extents */ error = xfs_defer_finish(&tp); if (error) goto out; - - error = xfs_trans_roll_inode(&tp, ip); - if (error) - goto out; } if (whichfork == XFS_DATA_FORK) { diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index bca7659fb5c6..3fbd98f61ea5 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1716,39 +1716,17 @@ out_free_buf: return error; } -struct getfsmap_info { - struct xfs_mount *mp; - struct fsmap_head __user *data; - unsigned int idx; - __u32 last_flags; -}; - -STATIC int -xfs_getfsmap_format(struct xfs_fsmap *xfm, void *priv) -{ - struct getfsmap_info *info = priv; - struct fsmap fm; - - trace_xfs_getfsmap_mapping(info->mp, xfm); - - info->last_flags = xfm->fmr_flags; - xfs_fsmap_from_internal(&fm, xfm); - if (copy_to_user(&info->data->fmh_recs[info->idx++], &fm, - sizeof(struct fsmap))) - return -EFAULT; - - return 0; -} - STATIC int xfs_ioc_getfsmap( struct xfs_inode *ip, struct fsmap_head __user *arg) { - struct getfsmap_info info = { NULL }; struct xfs_fsmap_head xhead = {0}; struct fsmap_head head; - bool aborted = false; + struct fsmap *recs; + unsigned int count; + __u32 last_flags = 0; + bool done = false; int error; if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) @@ -1760,38 +1738,112 @@ xfs_ioc_getfsmap( sizeof(head.fmh_keys[1].fmr_reserved))) return -EINVAL; + /* + * Use an internal memory buffer so that we don't have to copy fsmap + * data to userspace while holding locks. Start by trying to allocate + * up to 128k for the buffer, but fall back to a single page if needed. + */ + count = min_t(unsigned int, head.fmh_count, + 131072 / sizeof(struct fsmap)); + recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL); + if (!recs) { + count = min_t(unsigned int, head.fmh_count, + PAGE_SIZE / sizeof(struct fsmap)); + recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL); + if (!recs) + return -ENOMEM; + } + xhead.fmh_iflags = head.fmh_iflags; - xhead.fmh_count = head.fmh_count; xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]); xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]); trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]); trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]); - info.mp = ip->i_mount; - info.data = arg; - error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info); - if (error == -ECANCELED) { - error = 0; - aborted = true; - } else if (error) - return error; + head.fmh_entries = 0; + do { + struct fsmap __user *user_recs; + struct fsmap *last_rec; + + user_recs = &arg->fmh_recs[head.fmh_entries]; + xhead.fmh_entries = 0; + xhead.fmh_count = min_t(unsigned int, count, + head.fmh_count - head.fmh_entries); + + /* Run query, record how many entries we got. */ + error = xfs_getfsmap(ip->i_mount, &xhead, recs); + switch (error) { + case 0: + /* + * There are no more records in the result set. Copy + * whatever we got to userspace and break out. + */ + done = true; + break; + case -ECANCELED: + /* + * The internal memory buffer is full. Copy whatever + * records we got to userspace and go again if we have + * not yet filled the userspace buffer. + */ + error = 0; + break; + default: + goto out_free; + } + head.fmh_entries += xhead.fmh_entries; + head.fmh_oflags = xhead.fmh_oflags; - /* If we didn't abort, set the "last" flag in the last fmx */ - if (!aborted && info.idx) { - info.last_flags |= FMR_OF_LAST; - if (copy_to_user(&info.data->fmh_recs[info.idx - 1].fmr_flags, - &info.last_flags, sizeof(info.last_flags))) - return -EFAULT; + /* + * If the caller wanted a record count or there aren't any + * new records to return, we're done. + */ + if (head.fmh_count == 0 || xhead.fmh_entries == 0) + break; + + /* Copy all the records we got out to userspace. */ + if (copy_to_user(user_recs, recs, + xhead.fmh_entries * sizeof(struct fsmap))) { + error = -EFAULT; + goto out_free; + } + + /* Remember the last record flags we copied to userspace. */ + last_rec = &recs[xhead.fmh_entries - 1]; + last_flags = last_rec->fmr_flags; + + /* Set up the low key for the next iteration. */ + xfs_fsmap_to_internal(&xhead.fmh_keys[0], last_rec); + trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]); + } while (!done && head.fmh_entries < head.fmh_count); + + /* + * If there are no more records in the query result set and we're not + * in counting mode, mark the last record returned with the LAST flag. + */ + if (done && head.fmh_count > 0 && head.fmh_entries > 0) { + struct fsmap __user *user_rec; + + last_flags |= FMR_OF_LAST; + user_rec = &arg->fmh_recs[head.fmh_entries - 1]; + + if (copy_to_user(&user_rec->fmr_flags, &last_flags, + sizeof(last_flags))) { + error = -EFAULT; + goto out_free; + } } /* copy back header */ - head.fmh_entries = xhead.fmh_entries; - head.fmh_oflags = xhead.fmh_oflags; - if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) - return -EFAULT; + if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) { + error = -EFAULT; + goto out_free; + } - return 0; +out_free: + kmem_free(recs); + return error; } STATIC int diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 80a13c8561d8..5e165456da68 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -237,7 +237,7 @@ xfs_vn_create( umode_t mode, bool flags) { - return xfs_vn_mknod(dir, dentry, mode, 0); + return xfs_generic_create(dir, dentry, mode, 0, false); } STATIC int @@ -246,7 +246,7 @@ xfs_vn_mkdir( struct dentry *dentry, umode_t mode) { - return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0); + return xfs_generic_create(dir, dentry, mode | S_IFDIR, 0, false); } STATIC struct dentry * diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index ab737fed7b12..5b7a1e201559 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -123,7 +123,6 @@ typedef __u32 xfs_nlink_t; #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ -#define SYNCHRONIZE() barrier() #define __return_address __builtin_return_address(0) /* @@ -176,6 +175,12 @@ static inline xfs_dev_t linux_to_xfs_dev_t(dev_t dev) #define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL) #define xfs_stack_trace() dump_stack() +static inline uint64_t rounddown_64(uint64_t x, uint32_t y) +{ + do_div(x, y); + return x * y; +} + static inline uint64_t roundup_64(uint64_t x, uint32_t y) { x += y - 1; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ad0c69ee8947..fa2d05e65ff1 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1475,14 +1475,14 @@ xlog_commit_record( } /* - * Push on the buffer cache code if we ever use more than 75% of the on-disk - * log space. This code pushes on the lsn which would supposedly free up - * the 25% which we want to leave free. We may need to adopt a policy which - * pushes on an lsn which is further along in the log once we reach the high - * water mark. In this manner, we would be creating a low water mark. + * Compute the LSN that we'd need to push the log tail towards in order to have + * (a) enough on-disk log space to log the number of bytes specified, (b) at + * least 25% of the log space free, and (c) at least 256 blocks free. If the + * log free space already meets all three thresholds, this function returns + * NULLCOMMITLSN. */ -STATIC void -xlog_grant_push_ail( +xfs_lsn_t +xlog_grant_push_threshold( struct xlog *log, int need_bytes) { @@ -1508,7 +1508,7 @@ xlog_grant_push_ail( free_threshold = max(free_threshold, (log->l_logBBsize >> 2)); free_threshold = max(free_threshold, 256); if (free_blocks >= free_threshold) - return; + return NULLCOMMITLSN; xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, &threshold_block); @@ -1528,13 +1528,33 @@ xlog_grant_push_ail( if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) threshold_lsn = last_sync_lsn; + return threshold_lsn; +} + +/* + * Push the tail of the log if we need to do so to maintain the free log space + * thresholds set out by xlog_grant_push_threshold. We may need to adopt a + * policy which pushes on an lsn which is further along in the log once we + * reach the high water mark. In this manner, we would be creating a low water + * mark. + */ +STATIC void +xlog_grant_push_ail( + struct xlog *log, + int need_bytes) +{ + xfs_lsn_t threshold_lsn; + + threshold_lsn = xlog_grant_push_threshold(log, need_bytes); + if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log)) + return; + /* * Get the transaction layer to kick the dirty buffers out to * disk asynchronously. No point in trying to do this if * the filesystem is shutting down. */ - if (!XLOG_FORCED_SHUTDOWN(log)) - xfs_ail_push(log->l_ailp, threshold_lsn); + xfs_ail_push(log->l_ailp, threshold_lsn); } /* @@ -1604,9 +1624,7 @@ xlog_cksum( int i; int xheads; - xheads = size / XLOG_HEADER_CYCLE_SIZE; - if (size % XLOG_HEADER_CYCLE_SIZE) - xheads++; + xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE); for (i = 1; i < xheads; i++) { crc = crc32c(crc, &xhdr[i].hic_xheader, diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 1412d6993f1e..58c3fcbec94a 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -141,4 +141,6 @@ void xfs_log_quiesce(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); bool xfs_log_in_recovery(struct xfs_mount *); +xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes); + #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a17d788921d6..87886b7f77da 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -371,6 +371,19 @@ out: return error; } +static inline int +xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh) +{ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + int h_size = be32_to_cpu(rh->h_size); + + if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) && + h_size > XLOG_HEADER_CYCLE_SIZE) + return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE); + } + return 1; +} + /* * Potentially backup over partial log record write. * @@ -463,15 +476,7 @@ xlog_find_verify_log_record( * reset last_blk. Only when last_blk points in the middle of a log * record do we update last_blk. */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - uint h_size = be32_to_cpu(head->h_size); - - xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE; - if (h_size % XLOG_HEADER_CYCLE_SIZE) - xhdrs++; - } else { - xhdrs = 1; - } + xhdrs = xlog_logrec_hblks(log, head); if (*last_blk - i + extra_bblks != BTOBB(be32_to_cpu(head->h_len)) + xhdrs) @@ -1158,22 +1163,7 @@ xlog_check_unmount_rec( * below. We won't want to clear the unmount record if there is one, so * we pass the lsn of the unmount record rather than the block after it. */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - int h_size = be32_to_cpu(rhead->h_size); - int h_version = be32_to_cpu(rhead->h_version); - - if ((h_version & XLOG_VERSION_2) && - (h_size > XLOG_HEADER_CYCLE_SIZE)) { - hblks = h_size / XLOG_HEADER_CYCLE_SIZE; - if (h_size % XLOG_HEADER_CYCLE_SIZE) - hblks++; - } else { - hblks = 1; - } - } else { - hblks = 1; - } - + hblks = xlog_logrec_hblks(log, rhead); after_umount_blk = xlog_wrap_logbno(log, rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len))); @@ -2444,44 +2434,66 @@ xlog_recover_process_data( /* Take all the collected deferred ops and finish them in order. */ static int xlog_finish_defer_ops( - struct xfs_trans *parent_tp) + struct xfs_mount *mp, + struct list_head *capture_list) { - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_defer_capture *dfc, *next; struct xfs_trans *tp; - int64_t freeblks; - uint resblks; - int error; + struct xfs_inode *ip; + int error = 0; - /* - * We're finishing the defer_ops that accumulated as a result of - * recovering unfinished intent items during log recovery. We - * reserve an itruncate transaction because it is the largest - * permanent transaction type. Since we're the only user of the fs - * right now, take 93% (15/16) of the available free blocks. Use - * weird math to avoid a 64-bit division. - */ - freeblks = percpu_counter_sum(&mp->m_fdblocks); - if (freeblks <= 0) - return -ENOSPC; - resblks = min_t(int64_t, UINT_MAX, freeblks); - resblks = (resblks * 15) >> 4; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, - 0, XFS_TRANS_RESERVE, &tp); - if (error) - return error; - /* transfer all collected dfops to this transaction */ - xfs_defer_move(tp, parent_tp); + list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { + struct xfs_trans_res resv; - return xfs_trans_commit(tp); + /* + * Create a new transaction reservation from the captured + * information. Set logcount to 1 to force the new transaction + * to regrant every roll so that we can make forward progress + * in recovery no matter how full the log might be. + */ + resv.tr_logres = dfc->dfc_logres; + resv.tr_logcount = 1; + resv.tr_logflags = XFS_TRANS_PERM_LOG_RES; + + error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres, + dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + /* + * Transfer to this new transaction all the dfops we captured + * from recovering a single intent item. + */ + list_del_init(&dfc->dfc_list); + xfs_defer_ops_continue(dfc, tp, &ip); + + error = xfs_trans_commit(tp); + if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_irele(ip); + } + if (error) + return error; + } + + ASSERT(list_empty(capture_list)); + return 0; } -/* Is this log item a deferred action intent? */ -static inline bool xlog_item_is_intent(struct xfs_log_item *lip) +/* Release all the captured defer ops and capture structures in this list. */ +static void +xlog_abort_defer_ops( + struct xfs_mount *mp, + struct list_head *capture_list) { - return lip->li_ops->iop_recover != NULL && - lip->li_ops->iop_match != NULL; -} + struct xfs_defer_capture *dfc; + struct xfs_defer_capture *next; + list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { + list_del_init(&dfc->dfc_list); + xfs_defer_ops_release(mp, dfc); + } +} /* * When this is called, all of the log intent items which did not have * corresponding log done items should be in the AIL. What we do now @@ -2502,35 +2514,23 @@ STATIC int xlog_recover_process_intents( struct xlog *log) { - struct xfs_trans *parent_tp; + LIST_HEAD(capture_list); struct xfs_ail_cursor cur; struct xfs_log_item *lip; struct xfs_ail *ailp; - int error; + int error = 0; #if defined(DEBUG) || defined(XFS_WARN) xfs_lsn_t last_lsn; #endif - /* - * The intent recovery handlers commit transactions to complete recovery - * for individual intents, but any new deferred operations that are - * queued during that process are held off until the very end. The - * purpose of this transaction is to serve as a container for deferred - * operations. Each intent recovery handler must transfer dfops here - * before its local transaction commits, and we'll finish the entire - * list below. - */ - error = xfs_trans_alloc_empty(log->l_mp, &parent_tp); - if (error) - return error; - ailp = log->l_ailp; spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); #if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif - while (lip != NULL) { + for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + lip != NULL; + lip = xfs_trans_ail_cursor_next(ailp, &cur)) { /* * We're done when we see something other than an intent. * There should be no intents left in the AIL now. @@ -2552,26 +2552,29 @@ xlog_recover_process_intents( /* * NOTE: If your intent processing routine can create more - * deferred ops, you /must/ attach them to the transaction in - * this routine or else those subsequent intents will get + * deferred ops, you /must/ attach them to the capture list in + * the recover routine or else those subsequent intents will be * replayed in the wrong order! */ - if (!test_and_set_bit(XFS_LI_RECOVERED, &lip->li_flags)) { - spin_unlock(&ailp->ail_lock); - error = lip->li_ops->iop_recover(lip, parent_tp); - spin_lock(&ailp->ail_lock); - } + spin_unlock(&ailp->ail_lock); + error = lip->li_ops->iop_recover(lip, &capture_list); + spin_lock(&ailp->ail_lock); if (error) - goto out; - lip = xfs_trans_ail_cursor_next(ailp, &cur); + break; } -out: + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->ail_lock); - if (!error) - error = xlog_finish_defer_ops(parent_tp); - xfs_trans_cancel(parent_tp); + if (error) + goto err; + error = xlog_finish_defer_ops(log->l_mp, &capture_list); + if (error) + goto err; + + return 0; +err: + xlog_abort_defer_ops(log->l_mp, &capture_list); return error; } @@ -2878,7 +2881,8 @@ STATIC int xlog_valid_rec_header( struct xlog *log, struct xlog_rec_header *rhead, - xfs_daddr_t blkno) + xfs_daddr_t blkno, + int bufsize) { int hlen; @@ -2894,10 +2898,14 @@ xlog_valid_rec_header( return -EFSCORRUPTED; } - /* LR body must have data or it wouldn't have been written */ + /* + * LR body must have data (or it wouldn't have been written) + * and h_len must not be greater than LR buffer size. + */ hlen = be32_to_cpu(rhead->h_len); - if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > INT_MAX)) + if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > bufsize)) return -EFSCORRUPTED; + if (XFS_IS_CORRUPT(log->l_mp, blkno > log->l_logBBsize || blkno > INT_MAX)) return -EFSCORRUPTED; @@ -2958,9 +2966,6 @@ xlog_do_recovery_pass( goto bread_err1; rhead = (xlog_rec_header_t *)offset; - error = xlog_valid_rec_header(log, rhead, tail_blk); - if (error) - goto bread_err1; /* * xfsprogs has a bug where record length is based on lsunit but @@ -2975,30 +2980,22 @@ xlog_do_recovery_pass( */ h_size = be32_to_cpu(rhead->h_size); h_len = be32_to_cpu(rhead->h_len); - if (h_len > h_size) { - if (h_len <= log->l_mp->m_logbsize && - be32_to_cpu(rhead->h_num_logops) == 1) { - xfs_warn(log->l_mp, + if (h_len > h_size && h_len <= log->l_mp->m_logbsize && + rhead->h_num_logops == cpu_to_be32(1)) { + xfs_warn(log->l_mp, "invalid iclog size (%d bytes), using lsunit (%d bytes)", - h_size, log->l_mp->m_logbsize); - h_size = log->l_mp->m_logbsize; - } else { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, - log->l_mp); - error = -EFSCORRUPTED; - goto bread_err1; - } + h_size, log->l_mp->m_logbsize); + h_size = log->l_mp->m_logbsize; } - if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && - (h_size > XLOG_HEADER_CYCLE_SIZE)) { - hblks = h_size / XLOG_HEADER_CYCLE_SIZE; - if (h_size % XLOG_HEADER_CYCLE_SIZE) - hblks++; + error = xlog_valid_rec_header(log, rhead, tail_blk, h_size); + if (error) + goto bread_err1; + + hblks = xlog_logrec_hblks(log, rhead); + if (hblks != 1) { kmem_free(hbp); hbp = xlog_alloc_buffer(log, hblks); - } else { - hblks = 1; } } else { ASSERT(log->l_sectBBsize == 1); @@ -3070,7 +3067,7 @@ xlog_do_recovery_pass( } rhead = (xlog_rec_header_t *)offset; error = xlog_valid_rec_header(log, rhead, - split_hblks ? blk_no : 0); + split_hblks ? blk_no : 0, h_size); if (error) goto bread_err2; @@ -3151,7 +3148,7 @@ xlog_do_recovery_pass( goto bread_err2; rhead = (xlog_rec_header_t *)offset; - error = xlog_valid_rec_header(log, rhead, blk_no); + error = xlog_valid_rec_header(log, rhead, blk_no, h_size); if (error) goto bread_err2; @@ -3449,6 +3446,14 @@ xlog_recover_finish( int error; error = xlog_recover_process_intents(log); if (error) { + /* + * Cancel all the unprocessed intent items now so that + * we don't leave them pinned in the AIL. This can + * cause the AIL to livelock on the pinned item if + * anyone tries to push the AIL (inode reclaim does + * this) before we get around to xfs_log_mount_cancel. + */ + xlog_recover_cancel_intents(log); xfs_alert(log->l_mp, "Failed to recover intents"); return error; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 3f82e0c92c2d..b2a9abee8b2b 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -249,7 +249,6 @@ xfs_qm_unmount_quotas( STATIC int xfs_qm_dqattach_one( struct xfs_inode *ip, - xfs_dqid_t id, xfs_dqtype_t type, bool doalloc, struct xfs_dquot **IO_idqpp) @@ -330,23 +329,23 @@ xfs_qm_dqattach_locked( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { - error = xfs_qm_dqattach_one(ip, i_uid_read(VFS_I(ip)), - XFS_DQTYPE_USER, doalloc, &ip->i_udquot); + error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER, + doalloc, &ip->i_udquot); if (error) goto done; ASSERT(ip->i_udquot); } if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { - error = xfs_qm_dqattach_one(ip, i_gid_read(VFS_I(ip)), - XFS_DQTYPE_GROUP, doalloc, &ip->i_gdquot); + error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_GROUP, + doalloc, &ip->i_gdquot); if (error) goto done; ASSERT(ip->i_gdquot); } if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQTYPE_PROJ, + error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_PROJ, doalloc, &ip->i_pdquot); if (error) goto done; @@ -1663,6 +1662,7 @@ xfs_qm_vop_dqalloc( } if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { + ASSERT(O_udqpp); if (!uid_eq(inode->i_uid, uid)) { /* * What we need is the dquot that has this uid, and @@ -1696,6 +1696,7 @@ xfs_qm_vop_dqalloc( } } if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { + ASSERT(O_gdqpp); if (!gid_eq(inode->i_gid, gid)) { xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, from_kgid(user_ns, gid), @@ -1713,9 +1714,10 @@ xfs_qm_vop_dqalloc( } } if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { + ASSERT(O_pdqpp); if (ip->i_d.di_projid != prid) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, + error = xfs_qm_dqget(mp, prid, XFS_DQTYPE_PROJ, true, &pq); if (error) { ASSERT(error != -ENOENT); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index ca93b6488377..7529eb63ce94 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -424,7 +424,7 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { STATIC int xfs_cui_item_recover( struct xfs_log_item *lip, - struct xfs_trans *parent_tp) + struct list_head *capture_list) { struct xfs_bmbt_irec irec; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); @@ -432,7 +432,7 @@ xfs_cui_item_recover( struct xfs_cud_log_item *cudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_mount *mp = lip->li_mountp; xfs_fsblock_t startblock_fsb; xfs_fsblock_t new_fsb; xfs_extlen_t new_len; @@ -467,14 +467,8 @@ xfs_cui_item_recover( refc->pe_len == 0 || startblock_fsb >= mp->m_sb.sb_dblocks || refc->pe_len >= mp->m_sb.sb_agblocks || - (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) { - /* - * This will pull the CUI from the AIL and - * free the memory associated with it. - */ - xfs_cui_release(cuip); + (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) return -EFSCORRUPTED; - } } /* @@ -493,12 +487,7 @@ xfs_cui_item_recover( mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp); if (error) return error; - /* - * Recovery stashes all deferred ops during intent processing and - * finishes them on completion. Transfer current dfops state to this - * transaction and transfer the result back before we return. - */ - xfs_defer_move(tp, parent_tp); + cudp = xfs_trans_get_cud(tp, cuip); for (i = 0; i < cuip->cui_format.cui_nextents; i++) { @@ -555,13 +544,10 @@ xfs_cui_item_recover( } xfs_refcount_finish_one_cleanup(tp, rcur, error); - xfs_defer_move(parent_tp, tp); - error = xfs_trans_commit(tp); - return error; + return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); abort_error: xfs_refcount_finish_one_cleanup(tp, rcur, error); - xfs_defer_move(parent_tp, tp); xfs_trans_cancel(tp); return error; } @@ -574,6 +560,32 @@ xfs_cui_item_match( return CUI_ITEM(lip)->cui_format.cui_id == intent_id; } +/* Relog an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_cui_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_cud_log_item *cudp; + struct xfs_cui_log_item *cuip; + struct xfs_phys_extent *extp; + unsigned int count; + + count = CUI_ITEM(intent)->cui_format.cui_nextents; + extp = CUI_ITEM(intent)->cui_format.cui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); + + cuip = xfs_cui_init(tp->t_mountp, count); + memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp)); + atomic_set(&cuip->cui_next_extent, count); + xfs_trans_add_item(tp, &cuip->cui_item); + set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); + return &cuip->cui_item; +} + static const struct xfs_item_ops xfs_cui_item_ops = { .iop_size = xfs_cui_item_size, .iop_format = xfs_cui_item_format, @@ -581,6 +593,7 @@ static const struct xfs_item_ops xfs_cui_item_ops = { .iop_release = xfs_cui_item_release, .iop_recover = xfs_cui_item_recover, .iop_match = xfs_cui_item_match, + .iop_relog = xfs_cui_item_relog, }; /* diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index dc5b0753cd51..7adc996ca6e3 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -467,14 +467,14 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = { STATIC int xfs_rui_item_recover( struct xfs_log_item *lip, - struct xfs_trans *parent_tp) + struct list_head *capture_list) { struct xfs_rui_log_item *ruip = RUI_ITEM(lip); struct xfs_map_extent *rmap; struct xfs_rud_log_item *rudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_mount *mp = lip->li_mountp; xfs_fsblock_t startblock_fsb; enum xfs_rmap_intent_type type; xfs_exntst_t state; @@ -511,14 +511,8 @@ xfs_rui_item_recover( rmap->me_len == 0 || startblock_fsb >= mp->m_sb.sb_dblocks || rmap->me_len >= mp->m_sb.sb_agblocks || - (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)) { - /* - * This will pull the RUI from the AIL and - * free the memory associated with it. - */ - xfs_rui_release(ruip); + (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)) return -EFSCORRUPTED; - } } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, @@ -573,8 +567,7 @@ xfs_rui_item_recover( } xfs_rmap_finish_one_cleanup(tp, rcur, error); - error = xfs_trans_commit(tp); - return error; + return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); abort_error: xfs_rmap_finish_one_cleanup(tp, rcur, error); @@ -590,6 +583,32 @@ xfs_rui_item_match( return RUI_ITEM(lip)->rui_format.rui_id == intent_id; } +/* Relog an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_rui_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_rud_log_item *rudp; + struct xfs_rui_log_item *ruip; + struct xfs_map_extent *extp; + unsigned int count; + + count = RUI_ITEM(intent)->rui_format.rui_nextents; + extp = RUI_ITEM(intent)->rui_format.rui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); + + ruip = xfs_rui_init(tp->t_mountp, count); + memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp)); + atomic_set(&ruip->rui_next_extent, count); + xfs_trans_add_item(tp, &ruip->rui_item); + set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); + return &ruip->rui_item; +} + static const struct xfs_item_ops xfs_rui_item_ops = { .iop_size = xfs_rui_item_size, .iop_format = xfs_rui_item_format, @@ -597,6 +616,7 @@ static const struct xfs_item_ops xfs_rui_item_ops = { .iop_release = xfs_rui_item_release, .iop_recover = xfs_rui_item_recover, .iop_match = xfs_rui_item_match, + .iop_relog = xfs_rui_item_relog, }; /* diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 5b89c12f1566..ede1baf31413 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -18,7 +18,7 @@ #include "xfs_trans_space.h" #include "xfs_icache.h" #include "xfs_rtalloc.h" - +#include "xfs_sb.h" /* * Read and return the summary information for a given extent size, @@ -778,8 +778,14 @@ xfs_growfs_rt_alloc( struct xfs_bmbt_irec map; /* block map output */ int nmap; /* number of block maps */ int resblks; /* space reservation */ + enum xfs_blft buf_type; struct xfs_trans *tp; + if (ip == mp->m_rsumip) + buf_type = XFS_BLFT_RTSUMMARY_BUF; + else + buf_type = XFS_BLFT_RTBITMAP_BUF; + /* * Allocate space to the file, as necessary. */ @@ -841,6 +847,9 @@ xfs_growfs_rt_alloc( mp->m_bsize, 0, &bp); if (error) goto out_trans_cancel; + + xfs_trans_buf_set_type(tp, bp, buf_type); + bp->b_ops = &xfs_rtbuf_ops; memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); /* @@ -1015,23 +1024,29 @@ xfs_growfs_rt( /* * Lock out other callers by grabbing the bitmap inode lock. */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); /* - * Update the bitmap inode's size. + * Update the bitmap inode's size ondisk and incore. We need + * to update the incore size so that inode inactivation won't + * punch what it thinks are "posteof" blocks. */ mp->m_rbmip->i_d.di_size = nsbp->sb_rbmblocks * nsbp->sb_blocksize; + i_size_write(VFS_I(mp->m_rbmip), mp->m_rbmip->i_d.di_size); xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); /* * Get the summary inode into the transaction. */ - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); /* - * Update the summary inode's size. + * Update the summary inode's size. We need to update the + * incore size so that inode inactivation won't punch what it + * thinks are "posteof" blocks. */ mp->m_rsumip->i_d.di_size = nmp->m_rsumsize; + i_size_write(VFS_I(mp->m_rsumip), mp->m_rsumip->i_d.di_size); xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE); /* * Copy summary data from old to new sizes. @@ -1087,7 +1102,13 @@ error_cancel: if (error) break; } + if (error) + goto out_free; + + /* Update secondary superblocks now the physical grow has completed */ + error = xfs_update_secondary_sbs(mp); +out_free: /* * Free the fake mp structure. */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index f70f1255220b..20e0534a772c 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -23,6 +23,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) uint64_t xs_xstrat_bytes = 0; uint64_t xs_write_bytes = 0; uint64_t xs_read_bytes = 0; + uint64_t defer_relog = 0; static const struct xstats_entry { char *desc; @@ -70,10 +71,13 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes; xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes; xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes; + defer_relog += per_cpu_ptr(stats, i)->s.defer_relog; } len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); + len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n", + defer_relog); len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n", #if defined(DEBUG) 1); diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 34d704f703d2..43ffba74f045 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -137,6 +137,7 @@ struct __xfsstats { uint64_t xs_xstrat_bytes; uint64_t xs_write_bytes; uint64_t xs_read_bytes; + uint64_t defer_relog; }; #define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t)) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index baf5de30eebb..e3e229e52512 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -794,8 +794,7 @@ xfs_fs_statfs( statp->f_namelen = MAXNAMELEN - 1; id = huge_encode_dev(mp->m_ddev_targp->bt_dev); - statp->f_fsid.val[0] = (u32)id; - statp->f_fsid.val[1] = (u32)(id >> 32); + statp->f_fsid = u64_to_fsid(id); icount = percpu_counter_sum(&mp->m_icount); ifree = percpu_counter_sum(&mp->m_ifree); @@ -1234,25 +1233,12 @@ xfs_fc_parse_param( case Opt_nouuid: mp->m_flags |= XFS_MOUNT_NOUUID; return 0; - case Opt_ikeep: - mp->m_flags |= XFS_MOUNT_IKEEP; - return 0; - case Opt_noikeep: - mp->m_flags &= ~XFS_MOUNT_IKEEP; - return 0; case Opt_largeio: mp->m_flags |= XFS_MOUNT_LARGEIO; return 0; case Opt_nolargeio: mp->m_flags &= ~XFS_MOUNT_LARGEIO; return 0; - case Opt_attr2: - mp->m_flags |= XFS_MOUNT_ATTR2; - return 0; - case Opt_noattr2: - mp->m_flags &= ~XFS_MOUNT_ATTR2; - mp->m_flags |= XFS_MOUNT_NOATTR2; - return 0; case Opt_filestreams: mp->m_flags |= XFS_MOUNT_FILESTREAMS; return 0; @@ -1304,6 +1290,24 @@ xfs_fc_parse_param( xfs_mount_set_dax_mode(mp, result.uint_32); return 0; #endif + /* Following mount options will be removed in September 2025 */ + case Opt_ikeep: + xfs_warn(mp, "%s mount option is deprecated.", param->key); + mp->m_flags |= XFS_MOUNT_IKEEP; + return 0; + case Opt_noikeep: + xfs_warn(mp, "%s mount option is deprecated.", param->key); + mp->m_flags &= ~XFS_MOUNT_IKEEP; + return 0; + case Opt_attr2: + xfs_warn(mp, "%s mount option is deprecated.", param->key); + mp->m_flags |= XFS_MOUNT_ATTR2; + return 0; + case Opt_noattr2: + xfs_warn(mp, "%s mount option is deprecated.", param->key); + mp->m_flags &= ~XFS_MOUNT_ATTR2; + mp->m_flags |= XFS_MOUNT_NOATTR2; + return 0; default: xfs_warn(mp, "unknown mount option [%s].", param->key); return -EINVAL; @@ -1450,6 +1454,19 @@ xfs_fc_fill_super( if (error) goto out_free_sb; + /* V4 support is undergoing deprecation. */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) { +#ifdef CONFIG_XFS_SUPPORT_V4 + xfs_warn_once(mp, + "Deprecated V4 format (crc=0) will not be supported after September 2030."); +#else + xfs_warn(mp, + "Deprecated V4 format (crc=0) not supported by kernel."); + error = -EINVAL; + goto out_free_sb; +#endif + } + /* * XFS block mappings use 54 bits to store the logical block offset. * This should suffice to handle the maximum file size that the VFS diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 021ef96d0542..fac9de7ee6d0 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -50,13 +50,45 @@ xfs_panic_mask_proc_handler( } #endif /* CONFIG_PROC_FS */ +STATIC int +xfs_deprecate_irix_sgid_inherit_proc_handler( + struct ctl_table *ctl, + int write, + void *buffer, + size_t *lenp, + loff_t *ppos) +{ + if (write) { + printk_once(KERN_WARNING + "XFS: " "%s sysctl option is deprecated.\n", + ctl->procname); + } + return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); +} + +STATIC int +xfs_deprecate_irix_symlink_mode_proc_handler( + struct ctl_table *ctl, + int write, + void *buffer, + size_t *lenp, + loff_t *ppos) +{ + if (write) { + printk_once(KERN_WARNING + "XFS: " "%s sysctl option is deprecated.\n", + ctl->procname); + } + return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); +} + static struct ctl_table xfs_table[] = { { .procname = "irix_sgid_inherit", .data = &xfs_params.sgid_inherit.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = xfs_deprecate_irix_sgid_inherit_proc_handler, .extra1 = &xfs_params.sgid_inherit.min, .extra2 = &xfs_params.sgid_inherit.max }, @@ -65,7 +97,7 @@ static struct ctl_table xfs_table[] = { .data = &xfs_params.symlink_mode.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = xfs_deprecate_irix_symlink_mode_proc_handler, .extra1 = &xfs_params.symlink_mode.min, .extra2 = &xfs_params.symlink_mode.max }, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index dcdcf99cfa5d..86951652d3ed 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2533,6 +2533,7 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent); DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent); #define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index ca18a040336a..c94e71f741b6 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -959,7 +959,7 @@ xfs_trans_cancel( struct xfs_log_item *lip; list_for_each_entry(lip, &tp->t_items, li_trans) - ASSERT(!(lip->li_type == XFS_LI_EFD)); + ASSERT(!xlog_item_is_intent_done(lip)); } #endif xfs_trans_unreserve_and_mod_sb(tp); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index f46534b75236..084658946cc8 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -55,14 +55,12 @@ struct xfs_log_item { #define XFS_LI_ABORTED 1 #define XFS_LI_FAILED 2 #define XFS_LI_DIRTY 3 /* log item dirty in transaction */ -#define XFS_LI_RECOVERED 4 /* log intent item has been recovered */ #define XFS_LI_FLAGS \ { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \ { (1 << XFS_LI_ABORTED), "ABORTED" }, \ { (1 << XFS_LI_FAILED), "FAILED" }, \ - { (1 << XFS_LI_DIRTY), "DIRTY" }, \ - { (1 << XFS_LI_RECOVERED), "RECOVERED" } + { (1 << XFS_LI_DIRTY), "DIRTY" } struct xfs_item_ops { unsigned flags; @@ -74,10 +72,29 @@ struct xfs_item_ops { void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn); void (*iop_release)(struct xfs_log_item *); xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); - int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp); + int (*iop_recover)(struct xfs_log_item *lip, + struct list_head *capture_list); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); + struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, + struct xfs_trans *tp); }; +/* Is this log item a deferred action intent? */ +static inline bool +xlog_item_is_intent(struct xfs_log_item *lip) +{ + return lip->li_ops->iop_recover != NULL && + lip->li_ops->iop_match != NULL; +} + +/* Is this a log intent-done item? */ +static inline bool +xlog_item_is_intent_done(struct xfs_log_item *lip) +{ + return lip->li_ops->iop_unpin == NULL && + lip->li_ops->iop_push == NULL; +} + /* * Release the log item as soon as committed. This is for items just logging * intents that never need to be written back in place. @@ -243,4 +260,12 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, extern kmem_zone_t *xfs_trans_zone; +static inline struct xfs_log_item * +xfs_trans_item_relog( + struct xfs_log_item *lip, + struct xfs_trans *tp) +{ + return lip->li_ops->iop_relog(lip, tp); +} + #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 133fc6fc3edd..fe45b0c3970c 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -221,36 +221,27 @@ xfs_trans_mod_dquot( } switch (field) { - - /* - * regular disk blk reservation - */ - case XFS_TRANS_DQ_RES_BLKS: + /* regular disk blk reservation */ + case XFS_TRANS_DQ_RES_BLKS: qtrx->qt_blk_res += delta; break; - /* - * inode reservation - */ - case XFS_TRANS_DQ_RES_INOS: + /* inode reservation */ + case XFS_TRANS_DQ_RES_INOS: qtrx->qt_ino_res += delta; break; - /* - * disk blocks used. - */ - case XFS_TRANS_DQ_BCOUNT: + /* disk blocks used. */ + case XFS_TRANS_DQ_BCOUNT: qtrx->qt_bcount_delta += delta; break; - case XFS_TRANS_DQ_DELBCOUNT: + case XFS_TRANS_DQ_DELBCOUNT: qtrx->qt_delbcnt_delta += delta; break; - /* - * Inode Count - */ - case XFS_TRANS_DQ_ICOUNT: + /* Inode Count */ + case XFS_TRANS_DQ_ICOUNT: if (qtrx->qt_ino_res && delta > 0) { qtrx->qt_ino_res_used += delta; ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used); @@ -258,17 +249,13 @@ xfs_trans_mod_dquot( qtrx->qt_icount_delta += delta; break; - /* - * rtblk reservation - */ - case XFS_TRANS_DQ_RES_RTBLKS: + /* rtblk reservation */ + case XFS_TRANS_DQ_RES_RTBLKS: qtrx->qt_rtblk_res += delta; break; - /* - * rtblk count - */ - case XFS_TRANS_DQ_RTBCOUNT: + /* rtblk count */ + case XFS_TRANS_DQ_RTBCOUNT: if (qtrx->qt_rtblk_res && delta > 0) { qtrx->qt_rtblk_res_used += delta; ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used); @@ -276,11 +263,11 @@ xfs_trans_mod_dquot( qtrx->qt_rtbcount_delta += delta; break; - case XFS_TRANS_DQ_DELRTBCOUNT: + case XFS_TRANS_DQ_DELRTBCOUNT: qtrx->qt_delrtb_delta += delta; break; - default: + default: ASSERT(0); } diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 8ec7c8f109d7..ff5930be096c 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -24,6 +24,39 @@ #include "zonefs.h" +static inline int zonefs_zone_mgmt(struct inode *inode, + enum req_opf op) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + int ret; + + lockdep_assert_held(&zi->i_truncate_mutex); + + ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, + zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); + if (ret) { + zonefs_err(inode->i_sb, + "Zone management operation %s at %llu failed %d\n", + blk_op_str(op), zi->i_zsector, ret); + return ret; + } + + return 0; +} + +static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + i_size_write(inode, isize); + /* + * A full zone is no longer open/active and does not need + * explicit closing. + */ + if (isize >= zi->i_max_size) + zi->i_flags &= ~ZONEFS_ZONE_OPEN; +} + static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) @@ -302,6 +335,17 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, } /* + * If the filesystem is mounted with the explicit-open mount option, we + * need to clear the ZONEFS_ZONE_OPEN flag if the zone transitioned to + * the read-only or offline condition, to avoid attempting an explicit + * close of the zone when the inode file is closed. + */ + if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) && + (zone->cond == BLK_ZONE_COND_OFFLINE || + zone->cond == BLK_ZONE_COND_READONLY)) + zi->i_flags &= ~ZONEFS_ZONE_OPEN; + + /* * If error=remount-ro was specified, any error result in remounting * the volume as read-only. */ @@ -315,7 +359,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * invalid data. */ zonefs_update_stats(inode, data_size); - i_size_write(inode, data_size); + zonefs_i_size_write(inode, data_size); zi->i_wpoffset = data_size; return 0; @@ -328,7 +372,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * eventually correct the file size and zonefs inode write pointer offset * (which can be out of sync with the drive due to partial write failures). */ -static void zonefs_io_error(struct inode *inode, bool write) +static void __zonefs_io_error(struct inode *inode, bool write) { struct zonefs_inode_info *zi = ZONEFS_I(inode); struct super_block *sb = inode->i_sb; @@ -342,8 +386,6 @@ static void zonefs_io_error(struct inode *inode, bool write) }; int ret; - mutex_lock(&zi->i_truncate_mutex); - /* * Memory allocations in blkdev_report_zones() can trigger a memory * reclaim which may in turn cause a recursion into zonefs as well as @@ -359,7 +401,14 @@ static void zonefs_io_error(struct inode *inode, bool write) zonefs_err(sb, "Get inode %lu zone information failed %d\n", inode->i_ino, ret); memalloc_noio_restore(noio_flag); +} +static void zonefs_io_error(struct inode *inode, bool write) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + mutex_lock(&zi->i_truncate_mutex); + __zonefs_io_error(inode, write); mutex_unlock(&zi->i_truncate_mutex); } @@ -397,13 +446,27 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) if (isize == old_isize) goto unlock; - ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, - zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); - if (ret) { - zonefs_err(inode->i_sb, - "Zone management operation at %llu failed %d", - zi->i_zsector, ret); + ret = zonefs_zone_mgmt(inode, op); + if (ret) goto unlock; + + /* + * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, + * take care of open zones. + */ + if (zi->i_flags & ZONEFS_ZONE_OPEN) { + /* + * Truncating a zone to EMPTY or FULL is the equivalent of + * closing the zone. For a truncation to 0, we need to + * re-open the zone to ensure new writes can be processed. + * For a truncation to the maximum file size, the zone is + * closed and writes cannot be accepted anymore, so clear + * the open flag. + */ + if (!isize) + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + else + zi->i_flags &= ~ZONEFS_ZONE_OPEN; } zonefs_update_stats(inode, isize); @@ -584,7 +647,7 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, mutex_lock(&zi->i_truncate_mutex); if (i_size_read(inode) < iocb->ki_pos + size) { zonefs_update_stats(inode, iocb->ki_pos + size); - i_size_write(inode, iocb->ki_pos + size); + zonefs_i_size_write(inode, iocb->ki_pos + size); } mutex_unlock(&zi->i_truncate_mutex); } @@ -865,8 +928,128 @@ inode_unlock: return ret; } +static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + + if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN)) + return false; + + if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + return false; + + if (!(file->f_mode & FMODE_WRITE)) + return false; + + return true; +} + +static int zonefs_open_zone(struct inode *inode) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); + + zi->i_wr_refcnt++; + if (zi->i_wr_refcnt == 1) { + + if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) { + atomic_dec(&sbi->s_open_zones); + ret = -EBUSY; + goto unlock; + } + + if (i_size_read(inode) < zi->i_max_size) { + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + if (ret) { + zi->i_wr_refcnt--; + atomic_dec(&sbi->s_open_zones); + goto unlock; + } + zi->i_flags |= ZONEFS_ZONE_OPEN; + } + } + +unlock: + mutex_unlock(&zi->i_truncate_mutex); + + return ret; +} + +static int zonefs_file_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = generic_file_open(inode, file); + if (ret) + return ret; + + if (zonefs_file_use_exp_open(inode, file)) + return zonefs_open_zone(inode); + + return 0; +} + +static void zonefs_close_zone(struct inode *inode) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); + zi->i_wr_refcnt--; + if (!zi->i_wr_refcnt) { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + + /* + * If the file zone is full, it is not open anymore and we only + * need to decrement the open count. + */ + if (!(zi->i_flags & ZONEFS_ZONE_OPEN)) + goto dec; + + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); + if (ret) { + __zonefs_io_error(inode, false); + /* + * Leaving zones explicitly open may lead to a state + * where most zones cannot be written (zone resources + * exhausted). So take preventive action by remounting + * read-only. + */ + if (zi->i_flags & ZONEFS_ZONE_OPEN && + !(sb->s_flags & SB_RDONLY)) { + zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n"); + sb->s_flags |= SB_RDONLY; + } + } + zi->i_flags &= ~ZONEFS_ZONE_OPEN; +dec: + atomic_dec(&sbi->s_open_zones); + } + mutex_unlock(&zi->i_truncate_mutex); +} + +static int zonefs_file_release(struct inode *inode, struct file *file) +{ + /* + * If we explicitly open a zone we must close it again as well, but the + * zone management operation can fail (either due to an IO error or as + * the zone has gone offline or read-only). Make sure we don't fail the + * close(2) for user-space. + */ + if (zonefs_file_use_exp_open(inode, file)) + zonefs_close_zone(inode); + + return 0; +} + static const struct file_operations zonefs_file_operations = { - .open = generic_file_open, + .open = zonefs_file_open, + .release = zonefs_file_release, .fsync = zonefs_file_fsync, .mmap = zonefs_file_mmap, .llseek = zonefs_file_llseek, @@ -890,6 +1073,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); init_rwsem(&zi->i_mmap_sem); + zi->i_wr_refcnt = 0; return &zi->i_vnode; } @@ -932,15 +1116,14 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) fsid = le64_to_cpup((void *)sbi->s_uuid.b) ^ le64_to_cpup((void *)sbi->s_uuid.b + sizeof(u64)); - buf->f_fsid.val[0] = (u32)fsid; - buf->f_fsid.val[1] = (u32)(fsid >> 32); + buf->f_fsid = u64_to_fsid(fsid); return 0; } enum { Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair, - Opt_err, + Opt_explicit_open, Opt_err, }; static const match_table_t tokens = { @@ -948,6 +1131,7 @@ static const match_table_t tokens = { { Opt_errors_zro, "errors=zone-ro"}, { Opt_errors_zol, "errors=zone-offline"}, { Opt_errors_repair, "errors=repair"}, + { Opt_explicit_open, "explicit-open" }, { Opt_err, NULL} }; @@ -984,6 +1168,9 @@ static int zonefs_parse_options(struct super_block *sb, char *options) sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR; break; + case Opt_explicit_open: + sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN; + break; default: return -EINVAL; } @@ -1403,6 +1590,13 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_gid = GLOBAL_ROOT_GID; sbi->s_perm = 0640; sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO; + sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev); + atomic_set(&sbi->s_open_zones, 0); + if (!sbi->s_max_open_zones && + sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { + zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n"); + sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; + } ret = zonefs_read_super(sb); if (ret) diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 55b39970acb2..51141907097c 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -38,6 +38,8 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) return ZONEFS_ZTYPE_SEQ; } +#define ZONEFS_ZONE_OPEN (1 << 0) + /* * In-memory inode data. */ @@ -74,6 +76,10 @@ struct zonefs_inode_info { */ struct mutex i_truncate_mutex; struct rw_semaphore i_mmap_sem; + + /* guarded by i_truncate_mutex */ + unsigned int i_wr_refcnt; + unsigned int i_flags; }; static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) @@ -154,6 +160,7 @@ enum zonefs_features { #define ZONEFS_MNTOPT_ERRORS_MASK \ (ZONEFS_MNTOPT_ERRORS_RO | ZONEFS_MNTOPT_ERRORS_ZRO | \ ZONEFS_MNTOPT_ERRORS_ZOL | ZONEFS_MNTOPT_ERRORS_REPAIR) +#define ZONEFS_MNTOPT_EXPLICIT_OPEN (1 << 4) /* Explicit open/close of zones on open/close */ /* * In-memory Super block information. @@ -175,6 +182,9 @@ struct zonefs_sb_info { loff_t s_blocks; loff_t s_used_blocks; + + unsigned int s_max_open_zones; + atomic_t s_open_zones; }; static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) |